blob: 154103dfea312337306febd45bca7c23014ae46c [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff
65
/* _PyUnicode_CHECK(op): type check used by the assert()s in this file.
   With Py_DEBUG it calls _PyUnicode_CheckConsistency(op, 0), i.e. the
   structural checks without the content scan; otherwise it is a plain
   PyUnicode_Check(op). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* Field accessors for the unicode object structures.

   The underscore-prefixed macros are raw casts to the structure field and
   perform no checking (except _PyUnicode_KIND and _PyUnicode_GET_LENGTH,
   which assert _PyUnicode_CHECK).  PyUnicode_UTF8() and
   PyUnicode_UTF8_LENGTH() assert that op is a ready unicode object and, for
   compact ASCII objects, use the data stored right after the PyASCIIObject
   header / the `length` field instead of the utf8 / utf8_length fields. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* File-local replacement for PyUnicode_READY(): asserts that op passes
   _PyUnicode_CHECK, then evaluates to 0 if op is already ready and to the
   result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200113
/* true if the utf8 pointer of op is the same pointer as its character data
   (op must not be a compact ASCII object) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of op is the same pointer as its character data.
   The macro must only reference its own argument: it previously expanded
   _PyUnicode_WSTR(unicode), silently capturing whatever variable named
   `unicode` happened to be in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
121
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
135
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four elements per iteration; the
   remaining (at most three) elements are copied one by one.  Every macro
   argument is evaluated exactly once, and `to` is parenthesized before the
   cast so that an expression such as `p + 1` is cast as a whole. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200159
Walter Dörwald16807132007-05-25 13:52:07 +0000160/* This dictionary holds all interned unicode strings. Note that references
161 to strings in this dictionary are *not* counted in the string's ob_refcnt.
162 When the interned string reaches a refcnt of 0 the string deallocation
163 function will delete the reference from this dictionary.
164
165 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000166 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000167*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200168static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000169
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000170/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200171static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200172
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails, unicode_empty
   is left NULL and no reference is taken.  On successful creation the
   object ends up with two references: the one held by the `unicode_empty`
   global and the one handed to the caller. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (returns NULL if the empty string could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200192/* Forward declaration */
193Py_LOCAL_INLINE(int)
194_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
195
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200196/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200197static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* Single character Unicode strings in the Latin-1 range are being
200 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 if the character is
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
233
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200234/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200235static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200236static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100237static int unicode_modifiable(PyObject *unicode);
238
Victor Stinnerfe226c02011-10-03 03:52:20 +0200239
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100241_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200242static PyObject *
243_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
244static PyObject *
245_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
246
247static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000248unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000249 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100250 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000251 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
252
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253static void
254raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300255 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100256 PyObject *unicode,
257 Py_ssize_t startpos, Py_ssize_t endpos,
258 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000259
/* Fast detection of line breaks among the first 128 code points.
   Indexed by code point 0..127; an entry is 1 if the character is a
   line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
287
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300288/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
289 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000291PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000293#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000294 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000296 /* This is actually an illegal character, so it should
297 not be passed to unichr. */
298 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299#endif
300}
301
#ifdef Py_DEBUG
/* Debug-only sanity check of the internal state of a unicode object.

   op must be a unicode object.  The state flags (kind, compact, ascii,
   ready), the data pointer and the utf8 / wstr caches are checked for
   mutual consistency with assert().  If check_content is non-zero and the
   string is not a wchar_t-only (PyUnicode_WCHAR_KIND) string, the characters
   are scanned as well to verify that the narrowest possible kind is used
   and that the data is followed by a null character.

   Always returns 1 so that it can be used inside assert(); any failure
   triggers an assertion instead of being reported to the caller. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header is present, so the
           utf8 / wstr_length fields must not be inspected */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: data starts right after the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must be the data pointer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
417
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100418static PyObject*
419unicode_result_wchar(PyObject *unicode)
420{
421#ifndef Py_DEBUG
422 Py_ssize_t len;
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100426 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200427 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100432 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200440 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 return NULL;
442 }
443#else
Victor Stinneraa771272012-10-04 02:32:58 +0200444 assert(Py_REFCNT(unicode) == 1);
445
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 /* don't make the result ready in debug mode to ensure that the caller
447 makes the string ready before using it */
448 assert(_PyUnicode_CheckConsistency(unicode, 1));
449#endif
450 return unicode;
451}
452
453static PyObject*
454unicode_result_ready(PyObject *unicode)
455{
456 Py_ssize_t length;
457
458 length = PyUnicode_GET_LENGTH(unicode);
459 if (length == 0) {
460 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100461 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200462 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100463 }
464 return unicode_empty;
465 }
466
467 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200468 void *data = PyUnicode_DATA(unicode);
469 int kind = PyUnicode_KIND(unicode);
470 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100471 if (ch < 256) {
472 PyObject *latin1_char = unicode_latin1[ch];
473 if (latin1_char != NULL) {
474 if (unicode != latin1_char) {
475 Py_INCREF(latin1_char);
476 Py_DECREF(unicode);
477 }
478 return latin1_char;
479 }
480 else {
481 assert(_PyUnicode_CheckConsistency(unicode, 1));
482 Py_INCREF(unicode);
483 unicode_latin1[ch] = unicode;
484 return unicode;
485 }
486 }
487 }
488
489 assert(_PyUnicode_CheckConsistency(unicode, 1));
490 return unicode;
491}
492
493static PyObject*
494unicode_result(PyObject *unicode)
495{
496 assert(_PyUnicode_CHECK(unicode));
497 if (PyUnicode_IS_READY(unicode))
498 return unicode_result_ready(unicode);
499 else
500 return unicode_result_wchar(unicode);
501}
502
Victor Stinnerc4b49542011-12-11 22:44:26 +0100503static PyObject*
504unicode_result_unchanged(PyObject *unicode)
505{
506 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500507 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508 return NULL;
509 Py_INCREF(unicode);
510 return unicode;
511 }
512 else
513 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100514 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515}
516
#ifdef HAVE_MBCS
/* Windows version information; only present when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521/* --- Bloom Filters ----------------------------------------------------- */
522
523/* stuff to implement simple "bloom filters" for Unicode characters.
524 to keep things simple, we use a single bitmask, using the least 5
525 bits from each unicode characters as the bit index. */
526
527/* the linebreak mask is set up by Unicode_Init below */
528
Antoine Pitrouf068f942010-01-13 14:19:12 +0000529#if LONG_BIT >= 128
530#define BLOOM_WIDTH 128
531#elif LONG_BIT >= 64
532#define BLOOM_WIDTH 64
533#elif LONG_BIT >= 32
534#define BLOOM_WIDTH 32
535#else
536#error "LONG_BIT is smaller than 32"
537#endif
538
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539#define BLOOM_MASK unsigned long
540
Serhiy Storchaka05997252013-01-26 12:14:02 +0200541static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542
Antoine Pitrouf068f942010-01-13 14:19:12 +0000543#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544
Benjamin Peterson29060642009-01-31 22:14:21 +0000545#define BLOOM_LINEBREAK(ch) \
546 ((ch) < 128U ? ascii_linebreak[(ch)] : \
547 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
Alexander Belopolsky40018472011-02-26 01:02:56 +0000549Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551{
Victor Stinnera85af502013-04-09 21:53:54 +0200552#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
553 do { \
554 TYPE *data = (TYPE *)PTR; \
555 TYPE *end = data + LEN; \
556 Py_UCS4 ch; \
557 for (; data != end; data++) { \
558 ch = *data; \
559 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
560 } \
561 break; \
562 } while (0)
563
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564 /* calculate simple bloom-style bitmask for a given unicode string */
565
Antoine Pitrouf068f942010-01-13 14:19:12 +0000566 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567
568 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200569 switch (kind) {
570 case PyUnicode_1BYTE_KIND:
571 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
572 break;
573 case PyUnicode_2BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
575 break;
576 case PyUnicode_4BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
578 break;
579 default:
580 assert(0);
581 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000582 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200583
584#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585}
586
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200587/* Compilation of templated routines */
588
589#include "stringlib/asciilib.h"
590#include "stringlib/fastsearch.h"
591#include "stringlib/partition.h"
592#include "stringlib/split.h"
593#include "stringlib/count.h"
594#include "stringlib/find.h"
595#include "stringlib/find_max_char.h"
596#include "stringlib/localeutil.h"
597#include "stringlib/undef.h"
598
599#include "stringlib/ucs1lib.h"
600#include "stringlib/fastsearch.h"
601#include "stringlib/partition.h"
602#include "stringlib/split.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300605#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs2lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300616#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200617#include "stringlib/find_max_char.h"
618#include "stringlib/localeutil.h"
619#include "stringlib/undef.h"
620
621#include "stringlib/ucs4lib.h"
622#include "stringlib/fastsearch.h"
623#include "stringlib/partition.h"
624#include "stringlib/split.h"
625#include "stringlib/count.h"
626#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300627#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200628#include "stringlib/find_max_char.h"
629#include "stringlib/localeutil.h"
630#include "stringlib/undef.h"
631
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200632#include "stringlib/unicodedefs.h"
633#include "stringlib/fastsearch.h"
634#include "stringlib/count.h"
635#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100636#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638/* --- Unicode Object ----------------------------------------------------- */
639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200641fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200643Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
644 Py_ssize_t size, Py_UCS4 ch,
645 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
648
649 switch (kind) {
650 case PyUnicode_1BYTE_KIND:
651 {
652 Py_UCS1 ch1 = (Py_UCS1) ch;
653 if (ch1 == ch)
654 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
655 else
656 return -1;
657 }
658 case PyUnicode_2BYTE_KIND:
659 {
660 Py_UCS2 ch2 = (Py_UCS2) ch;
661 if (ch2 == ch)
662 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
663 else
664 return -1;
665 }
666 case PyUnicode_4BYTE_KIND:
667 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
668 default:
669 assert(0);
670 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672}
673
#ifdef Py_DEBUG
/* Fill the data of an Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index old_length up to the current length of
   unicode are overwritten (with 0xff bytes); nothing is done when the
   current length is not greater than old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
    if (length <= old_length)
        return;
    /* kind is used as the number of bytes per character */
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
}
#endif
692
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693static PyObject*
694resize_compact(PyObject *unicode, Py_ssize_t length)
695{
696 Py_ssize_t char_size;
697 Py_ssize_t struct_size;
698 Py_ssize_t new_size;
699 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100700 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
703#endif
704
Victor Stinner79891572012-05-03 13:43:07 +0200705 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100707 assert(PyUnicode_IS_COMPACT(unicode));
708
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200709 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100710 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 struct_size = sizeof(PyASCIIObject);
712 else
713 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
717 PyErr_NoMemory();
718 return NULL;
719 }
720 new_size = (struct_size + (length + 1) * char_size);
721
Victor Stinner84def372011-12-11 20:04:56 +0100722 _Py_DEC_REFTOTAL;
723 _Py_ForgetReference(unicode);
724
725 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
726 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100727 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200728 PyErr_NoMemory();
729 return NULL;
730 }
Victor Stinner84def372011-12-11 20:04:56 +0100731 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100733
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100737 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 _PyUnicode_WSTR_LENGTH(unicode) = length;
739 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100740 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
741 PyObject_DEL(_PyUnicode_WSTR(unicode));
742 _PyUnicode_WSTR(unicode) = NULL;
743 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200744#ifdef Py_DEBUG
745 unicode_fill_invalid(unicode, old_length);
746#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
748 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 return unicode;
751}
752
Alexander Belopolsky40018472011-02-26 01:02:56 +0000753static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200754resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755{
Victor Stinner95663112011-10-04 01:03:50 +0200756 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000760
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 if (PyUnicode_IS_READY(unicode)) {
762 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200765#ifdef Py_DEBUG
766 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
767#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200770 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200771 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
772 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
775 PyErr_NoMemory();
776 return -1;
777 }
778 new_size = (length + 1) * char_size;
779
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
781 {
782 PyObject_DEL(_PyUnicode_UTF8(unicode));
783 _PyUnicode_UTF8(unicode) = NULL;
784 _PyUnicode_UTF8_LENGTH(unicode) = 0;
785 }
786
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 data = (PyObject *)PyObject_REALLOC(data, new_size);
788 if (data == NULL) {
789 PyErr_NoMemory();
790 return -1;
791 }
792 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200793 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200795 _PyUnicode_WSTR_LENGTH(unicode) = length;
796 }
797 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200798 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200799 _PyUnicode_UTF8_LENGTH(unicode) = length;
800 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200801 _PyUnicode_LENGTH(unicode) = length;
802 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200803#ifdef Py_DEBUG
804 unicode_fill_invalid(unicode, old_length);
805#endif
Victor Stinner95663112011-10-04 01:03:50 +0200806 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200807 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200809 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200810 }
Victor Stinner95663112011-10-04 01:03:50 +0200811 assert(_PyUnicode_WSTR(unicode) != NULL);
812
813 /* check for integer overflow */
814 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
815 PyErr_NoMemory();
816 return -1;
817 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100818 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200819 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100820 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200821 if (!wstr) {
822 PyErr_NoMemory();
823 return -1;
824 }
825 _PyUnicode_WSTR(unicode) = wstr;
826 _PyUnicode_WSTR(unicode)[length] = 0;
827 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200828 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 return 0;
830}
831
Victor Stinnerfe226c02011-10-03 03:52:20 +0200832static PyObject*
833resize_copy(PyObject *unicode, Py_ssize_t length)
834{
835 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100836 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100838
Benjamin Petersonbac79492012-01-14 13:34:47 -0500839 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100840 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200841
842 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
843 if (copy == NULL)
844 return NULL;
845
846 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200847 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200848 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200849 }
850 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200851 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100852
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200853 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200854 if (w == NULL)
855 return NULL;
856 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
857 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200858 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
859 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200860 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200861 }
862}
863
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
872
Alexander Belopolsky40018472011-02-26 01:02:56 +0000873static PyUnicodeObject *
874_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200876 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880 if (length == 0 && unicode_empty != NULL) {
881 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200882 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 }
884
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000885 /* Ensure we won't overflow the size. */
886 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
887 return (PyUnicodeObject *)PyErr_NoMemory();
888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 if (length < 0) {
890 PyErr_SetString(PyExc_SystemError,
891 "Negative size passed to _PyUnicode_New");
892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
896 if (unicode == NULL)
897 return NULL;
898 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100899
900 _PyUnicode_WSTR_LENGTH(unicode) = length;
901 _PyUnicode_HASH(unicode) = -1;
902 _PyUnicode_STATE(unicode).interned = 0;
903 _PyUnicode_STATE(unicode).kind = 0;
904 _PyUnicode_STATE(unicode).compact = 0;
905 _PyUnicode_STATE(unicode).ready = 0;
906 _PyUnicode_STATE(unicode).ascii = 0;
907 _PyUnicode_DATA_ANY(unicode) = NULL;
908 _PyUnicode_LENGTH(unicode) = 0;
909 _PyUnicode_UTF8(unicode) = NULL;
910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
913 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100914 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000915 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100916 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200918
Jeremy Hyltond8082792003-09-16 19:41:39 +0000919 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000920 * the caller fails before initializing str -- unicode_resize()
921 * reads str[0], and the Keep-Alive optimization can keep memory
922 * allocated for str alive across a call to unicode_dealloc(unicode).
923 * We don't want unicode_resize to read uninitialized memory in
924 * that case.
925 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926 _PyUnicode_WSTR(unicode)[0] = 0;
927 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100928
Victor Stinner7931d9a2011-11-04 00:22:48 +0100929 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000930 return unicode;
931}
932
Victor Stinnerf42dc442011-10-02 23:33:16 +0200933static const char*
934unicode_kind_name(PyObject *unicode)
935{
Victor Stinner42dfd712011-10-03 14:41:45 +0200936 /* don't check consistency: unicode_kind_name() is called from
937 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938 if (!PyUnicode_IS_COMPACT(unicode))
939 {
940 if (!PyUnicode_IS_READY(unicode))
941 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600942 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 {
944 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200945 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946 return "legacy ascii";
947 else
948 return "legacy latin1";
949 case PyUnicode_2BYTE_KIND:
950 return "legacy UCS2";
951 case PyUnicode_4BYTE_KIND:
952 return "legacy UCS4";
953 default:
954 return "<legacy invalid kind>";
955 }
956 }
957 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600958 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200959 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200960 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200961 return "ascii";
962 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200967 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200968 default:
969 return "<invalid compact kind>";
970 }
971}
972
#ifdef Py_DEBUG
/* Plain functions wrapping macros, so they can be called from a debugger. */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object's address, its compact / compact-ascii flags and the
   candidate data addresses to stdout, then return PyUnicode_DATA(). */
void *
_PyUnicode_data(void *unicode)
{
    void *after_ascii = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of a string object's internals to stdout: kind
   name, length, wstr pointer, (except for compact ASCII strings) the wstr
   length and UTF-8 cache, and the data pointer.  Pointers that alias the
   data buffer are flagged as "shared". */
void
_PyUnicode_Dump(PyObject *op)
{
    /* The same object viewed through each of the three struct layouts. */
    PyASCIIObject *as_ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *as_compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *as_legacy = (PyUnicodeObject *)op;
    void *chars;

    /* Compact strings store their characters right after the struct. */
    if (!as_ascii->state.compact)
        chars = as_legacy->data.any;
    else if (as_ascii->state.ascii)
        chars = (as_ascii + 1);
    else
        chars = (as_compact + 1);

    printf("%s: len=%zu, ",unicode_kind_name(op), as_ascii->length);

    if (as_ascii->wstr == chars)
        printf("shared ");
    printf("wstr=%p", as_ascii->wstr);

    /* The wstr_length and utf8 fields are skipped for compact ASCII. */
    if (as_ascii->state.ascii != 1 || as_ascii->state.compact != 1) {
        printf(" (%zu), ", as_compact->wstr_length);
        if (!as_ascii->state.compact
            && as_compact->utf8 == as_legacy->data.any)
        {
            printf("shared ");
        }
        printf("utf8=%p (%zu)", as_compact->utf8, as_compact->utf8_length);
    }
    printf(", data=%p\n", chars);
}
#endif
1024
1025PyObject *
1026PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1027{
1028 PyObject *obj;
1029 PyCompactUnicodeObject *unicode;
1030 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001031 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001032 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035
1036 /* Optimization for empty strings */
1037 if (size == 0 && unicode_empty != NULL) {
1038 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001039 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 }
1041
Victor Stinner9e9d6892011-10-04 01:02:02 +02001042 is_ascii = 0;
1043 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 struct_size = sizeof(PyCompactUnicodeObject);
1045 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001046 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 char_size = 1;
1048 is_ascii = 1;
1049 struct_size = sizeof(PyASCIIObject);
1050 }
1051 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001052 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 char_size = 1;
1054 }
1055 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001056 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 char_size = 2;
1058 if (sizeof(wchar_t) == 2)
1059 is_sharing = 1;
1060 }
1061 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001062 if (maxchar > MAX_UNICODE) {
1063 PyErr_SetString(PyExc_SystemError,
1064 "invalid maximum character passed to PyUnicode_New");
1065 return NULL;
1066 }
Victor Stinner8f825062012-04-27 13:55:39 +02001067 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 char_size = 4;
1069 if (sizeof(wchar_t) == 4)
1070 is_sharing = 1;
1071 }
1072
1073 /* Ensure we won't overflow the size. */
1074 if (size < 0) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "Negative size passed to PyUnicode_New");
1077 return NULL;
1078 }
1079 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1080 return PyErr_NoMemory();
1081
1082 /* Duplicated allocation code from _PyObject_New() instead of a call to
1083 * PyObject_New() so we are able to allocate space for the object and
1084 * it's data buffer.
1085 */
1086 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1087 if (obj == NULL)
1088 return PyErr_NoMemory();
1089 obj = PyObject_INIT(obj, &PyUnicode_Type);
1090 if (obj == NULL)
1091 return NULL;
1092
1093 unicode = (PyCompactUnicodeObject *)obj;
1094 if (is_ascii)
1095 data = ((PyASCIIObject*)obj) + 1;
1096 else
1097 data = unicode + 1;
1098 _PyUnicode_LENGTH(unicode) = size;
1099 _PyUnicode_HASH(unicode) = -1;
1100 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001101 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001102 _PyUnicode_STATE(unicode).compact = 1;
1103 _PyUnicode_STATE(unicode).ready = 1;
1104 _PyUnicode_STATE(unicode).ascii = is_ascii;
1105 if (is_ascii) {
1106 ((char*)data)[size] = 0;
1107 _PyUnicode_WSTR(unicode) = NULL;
1108 }
Victor Stinner8f825062012-04-27 13:55:39 +02001109 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 ((char*)data)[size] = 0;
1111 _PyUnicode_WSTR(unicode) = NULL;
1112 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001114 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 else {
1117 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001118 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001119 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001121 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 ((Py_UCS4*)data)[size] = 0;
1123 if (is_sharing) {
1124 _PyUnicode_WSTR_LENGTH(unicode) = size;
1125 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1126 }
1127 else {
1128 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1129 _PyUnicode_WSTR(unicode) = NULL;
1130 }
1131 }
Victor Stinner8f825062012-04-27 13:55:39 +02001132#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001133 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001134#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001135 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136 return obj;
1137}
1138
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t sequence [begin, end) into the UCS4
   buffer of unicode.  A high surrogate immediately followed by a low
   surrogate is combined into one code point; every other unit, including a
   lone surrogate, is copied through unchanged.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the destination's declared length */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst++;
            src += 2;
        }
        else {
            *dst = *src;
            dst++;
            src++;
        }
    }
    /* the destination must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1178
Victor Stinnercd9950f2011-10-02 00:34:53 +02001179static int
Victor Stinner488fa492011-12-12 00:01:39 +01001180unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001181{
Victor Stinner488fa492011-12-12 00:01:39 +01001182 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001183 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001184 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001185 return -1;
1186 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001187 return 0;
1188}
1189
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001190static int
1191_copy_characters(PyObject *to, Py_ssize_t to_start,
1192 PyObject *from, Py_ssize_t from_start,
1193 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001195 unsigned int from_kind, to_kind;
1196 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001197
Victor Stinneree4544c2012-05-09 22:24:08 +02001198 assert(0 <= how_many);
1199 assert(0 <= from_start);
1200 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001201 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204
Victor Stinnerd3f08822012-05-29 12:57:52 +02001205 assert(PyUnicode_Check(to));
1206 assert(PyUnicode_IS_READY(to));
1207 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1208
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001209 if (how_many == 0)
1210 return 0;
1211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001215 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216
Victor Stinnerf1852262012-06-16 16:38:26 +02001217#ifdef Py_DEBUG
1218 if (!check_maxchar
1219 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1220 {
1221 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1222 Py_UCS4 ch;
1223 Py_ssize_t i;
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 assert(ch <= to_maxchar);
1227 }
1228 }
1229#endif
1230
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001231 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001232 if (check_maxchar
1233 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1234 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 /* Writing Latin-1 characters into an ASCII string requires to
1236 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 Py_UCS4 max_char;
1238 max_char = ucs1lib_find_max_char(from_data,
1239 (Py_UCS1*)from_data + how_many);
1240 if (max_char >= 128)
1241 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001243 Py_MEMCPY((char*)to_data + to_kind * to_start,
1244 (char*)from_data + from_kind * from_start,
1245 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001247 else if (from_kind == PyUnicode_1BYTE_KIND
1248 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001249 {
1250 _PyUnicode_CONVERT_BYTES(
1251 Py_UCS1, Py_UCS2,
1252 PyUnicode_1BYTE_DATA(from) + from_start,
1253 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1254 PyUnicode_2BYTE_DATA(to) + to_start
1255 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001256 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001257 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001258 && to_kind == PyUnicode_4BYTE_KIND)
1259 {
1260 _PyUnicode_CONVERT_BYTES(
1261 Py_UCS1, Py_UCS4,
1262 PyUnicode_1BYTE_DATA(from) + from_start,
1263 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1264 PyUnicode_4BYTE_DATA(to) + to_start
1265 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001266 }
1267 else if (from_kind == PyUnicode_2BYTE_KIND
1268 && to_kind == PyUnicode_4BYTE_KIND)
1269 {
1270 _PyUnicode_CONVERT_BYTES(
1271 Py_UCS2, Py_UCS4,
1272 PyUnicode_2BYTE_DATA(from) + from_start,
1273 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1274 PyUnicode_4BYTE_DATA(to) + to_start
1275 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001276 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001277 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001278 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1279
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001280 if (!check_maxchar) {
1281 if (from_kind == PyUnicode_2BYTE_KIND
1282 && to_kind == PyUnicode_1BYTE_KIND)
1283 {
1284 _PyUnicode_CONVERT_BYTES(
1285 Py_UCS2, Py_UCS1,
1286 PyUnicode_2BYTE_DATA(from) + from_start,
1287 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1288 PyUnicode_1BYTE_DATA(to) + to_start
1289 );
1290 }
1291 else if (from_kind == PyUnicode_4BYTE_KIND
1292 && to_kind == PyUnicode_1BYTE_KIND)
1293 {
1294 _PyUnicode_CONVERT_BYTES(
1295 Py_UCS4, Py_UCS1,
1296 PyUnicode_4BYTE_DATA(from) + from_start,
1297 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1298 PyUnicode_1BYTE_DATA(to) + to_start
1299 );
1300 }
1301 else if (from_kind == PyUnicode_4BYTE_KIND
1302 && to_kind == PyUnicode_2BYTE_KIND)
1303 {
1304 _PyUnicode_CONVERT_BYTES(
1305 Py_UCS4, Py_UCS2,
1306 PyUnicode_4BYTE_DATA(from) + from_start,
1307 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1308 PyUnicode_2BYTE_DATA(to) + to_start
1309 );
1310 }
1311 else {
1312 assert(0);
1313 return -1;
1314 }
1315 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001316 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001317 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001318 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001319 Py_ssize_t i;
1320
Victor Stinnera0702ab2011-09-29 14:14:38 +02001321 for (i=0; i < how_many; i++) {
1322 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001323 if (ch > to_maxchar)
1324 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001325 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1326 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001327 }
1328 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001329 return 0;
1330}
1331
Victor Stinnerd3f08822012-05-29 12:57:52 +02001332void
1333_PyUnicode_FastCopyCharacters(
1334 PyObject *to, Py_ssize_t to_start,
1335 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001336{
1337 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1338}
1339
1340Py_ssize_t
1341PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1342 PyObject *from, Py_ssize_t from_start,
1343 Py_ssize_t how_many)
1344{
1345 int err;
1346
1347 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1348 PyErr_BadInternalCall();
1349 return -1;
1350 }
1351
Benjamin Petersonbac79492012-01-14 13:34:47 -05001352 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001353 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001354 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001355 return -1;
1356
Victor Stinnerd3f08822012-05-29 12:57:52 +02001357 if (from_start < 0) {
1358 PyErr_SetString(PyExc_IndexError, "string index out of range");
1359 return -1;
1360 }
1361 if (to_start < 0) {
1362 PyErr_SetString(PyExc_IndexError, "string index out of range");
1363 return -1;
1364 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1366 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1367 PyErr_Format(PyExc_SystemError,
1368 "Cannot write %zi characters at %zi "
1369 "in a string of %zi characters",
1370 how_many, to_start, PyUnicode_GET_LENGTH(to));
1371 return -1;
1372 }
1373
1374 if (how_many == 0)
1375 return 0;
1376
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001378 return -1;
1379
1380 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1381 if (err) {
1382 PyErr_Format(PyExc_SystemError,
1383 "Cannot copy %s characters "
1384 "into a string of %s characters",
1385 unicode_kind_name(from),
1386 unicode_kind_name(to));
1387 return -1;
1388 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390}
1391
Victor Stinner17222162011-09-28 22:15:37 +02001392/* Find the maximum code point and count the number of surrogate pairs so a
1393 correct string length can be computed before converting a string to UCS4.
1394 This function counts single surrogates as a character and not as a pair.
1395
1396 Return 0 on success, or -1 on error. */
1397static int
1398find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1399 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400{
1401 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001402 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403
Victor Stinnerc53be962011-10-02 21:33:54 +02001404 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 *num_surrogates = 0;
1406 *maxchar = 0;
1407
1408 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001410 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1411 && (iter+1) < end
1412 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1413 {
1414 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1415 ++(*num_surrogates);
1416 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 }
1418 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001420 {
1421 ch = *iter;
1422 iter++;
1423 }
1424 if (ch > *maxchar) {
1425 *maxchar = ch;
1426 if (*maxchar > MAX_UNICODE) {
1427 PyErr_Format(PyExc_ValueError,
1428 "character U+%x is not in range [U+0000; U+10ffff]",
1429 ch);
1430 return -1;
1431 }
1432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 return 0;
1435}
1436
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001437int
1438_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439{
1440 wchar_t *end;
1441 Py_UCS4 maxchar = 0;
1442 Py_ssize_t num_surrogates;
1443#if SIZEOF_WCHAR_T == 2
1444 Py_ssize_t length_wo_surrogates;
1445#endif
1446
Georg Brandl7597add2011-10-05 16:36:47 +02001447 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001448 strings were created using _PyObject_New() and where no canonical
1449 representation (the str field) has been set yet aka strings
1450 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001451 assert(_PyUnicode_CHECK(unicode));
1452 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001454 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001455 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001456 /* Actually, it should neither be interned nor be anything else: */
1457 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001460 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
1464 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001465 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1466 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 PyErr_NoMemory();
1468 return -1;
1469 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001470 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 _PyUnicode_WSTR(unicode), end,
1472 PyUnicode_1BYTE_DATA(unicode));
1473 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1474 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1475 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1476 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001477 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001478 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001479 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 }
1481 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001483 _PyUnicode_UTF8(unicode) = NULL;
1484 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 PyObject_FREE(_PyUnicode_WSTR(unicode));
1487 _PyUnicode_WSTR(unicode) = NULL;
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 }
1490 /* In this case we might have to convert down from 4-byte native
1491 wchar_t to 2-byte unicode. */
1492 else if (maxchar < 65536) {
1493 assert(num_surrogates == 0 &&
1494 "FindMaxCharAndNumSurrogatePairs() messed up");
1495
Victor Stinner506f5922011-09-28 22:34:18 +02001496#if SIZEOF_WCHAR_T == 2
1497 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001498 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001499 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1500 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1501 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001502 _PyUnicode_UTF8(unicode) = NULL;
1503 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001504#else
1505 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001507 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001508 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001509 PyErr_NoMemory();
1510 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 }
Victor Stinner506f5922011-09-28 22:34:18 +02001512 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1513 _PyUnicode_WSTR(unicode), end,
1514 PyUnicode_2BYTE_DATA(unicode));
1515 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1516 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1517 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001518 _PyUnicode_UTF8(unicode) = NULL;
1519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001520 PyObject_FREE(_PyUnicode_WSTR(unicode));
1521 _PyUnicode_WSTR(unicode) = NULL;
1522 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1523#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524 }
1525 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1526 else {
1527#if SIZEOF_WCHAR_T == 2
1528 /* in case the native representation is 2-bytes, we need to allocate a
1529 new normalized 4-byte version. */
1530 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001531 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1532 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 PyErr_NoMemory();
1534 return -1;
1535 }
1536 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1537 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001538 _PyUnicode_UTF8(unicode) = NULL;
1539 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001540 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1541 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001542 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543 PyObject_FREE(_PyUnicode_WSTR(unicode));
1544 _PyUnicode_WSTR(unicode) = NULL;
1545 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1546#else
1547 assert(num_surrogates == 0);
1548
Victor Stinnerc3c74152011-10-02 20:39:55 +02001549 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001551 _PyUnicode_UTF8(unicode) = NULL;
1552 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1554#endif
1555 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1556 }
1557 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001558 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 return 0;
1560}
1561
Alexander Belopolsky40018472011-02-26 01:02:56 +00001562static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001563unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564{
Walter Dörwald16807132007-05-25 13:52:07 +00001565 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001566 case SSTATE_NOT_INTERNED:
1567 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001568
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 case SSTATE_INTERNED_MORTAL:
1570 /* revive dead object temporarily for DelItem */
1571 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001572 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 Py_FatalError(
1574 "deletion of interned string failed");
1575 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001576
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 case SSTATE_INTERNED_IMMORTAL:
1578 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001579
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 default:
1581 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001582 }
1583
Victor Stinner03490912011-10-03 23:45:12 +02001584 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001586 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001587 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001588 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1589 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001591 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592}
1593
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with canonical data can be in the
       Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return unicode_latin1[ch] == unicode ? 1 : 0;
}
#endif
1610
Alexander Belopolsky40018472011-02-26 01:02:56 +00001611static int
Victor Stinner488fa492011-12-12 00:01:39 +01001612unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001613{
Victor Stinner488fa492011-12-12 00:01:39 +01001614 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 if (Py_REFCNT(unicode) != 1)
1616 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001617 if (_PyUnicode_HASH(unicode) != -1)
1618 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001619 if (PyUnicode_CHECK_INTERNED(unicode))
1620 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001621 if (!PyUnicode_CheckExact(unicode))
1622 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001623#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001624 /* singleton refcount is greater than 1 */
1625 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001626#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001627 return 1;
1628}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629
Victor Stinnerfe226c02011-10-03 03:52:20 +02001630static int
1631unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1632{
1633 PyObject *unicode;
1634 Py_ssize_t old_length;
1635
1636 assert(p_unicode != NULL);
1637 unicode = *p_unicode;
1638
1639 assert(unicode != NULL);
1640 assert(PyUnicode_Check(unicode));
1641 assert(0 <= length);
1642
Victor Stinner910337b2011-10-03 03:20:16 +02001643 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001644 old_length = PyUnicode_WSTR_LENGTH(unicode);
1645 else
1646 old_length = PyUnicode_GET_LENGTH(unicode);
1647 if (old_length == length)
1648 return 0;
1649
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001650 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001651 _Py_INCREF_UNICODE_EMPTY();
1652 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001653 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 Py_DECREF(*p_unicode);
1655 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001656 return 0;
1657 }
1658
Victor Stinner488fa492011-12-12 00:01:39 +01001659 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001660 PyObject *copy = resize_copy(unicode, length);
1661 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001663 Py_DECREF(*p_unicode);
1664 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001665 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666 }
1667
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001669 PyObject *new_unicode = resize_compact(unicode, length);
1670 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001671 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001672 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001674 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001675 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001676}
1677
Alexander Belopolsky40018472011-02-26 01:02:56 +00001678int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001679PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001680{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001681 PyObject *unicode;
1682 if (p_unicode == NULL) {
1683 PyErr_BadInternalCall();
1684 return -1;
1685 }
1686 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001687 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688 {
1689 PyErr_BadInternalCall();
1690 return -1;
1691 }
1692 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001693}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001694
Victor Stinnerc5166102012-02-22 13:55:02 +01001695/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001696
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001697 WARNING: The function doesn't copy the terminating null character and
1698 doesn't check the maximum character (may write a latin1 character in an
1699 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001700static void
1701unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1702 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001703{
1704 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1705 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001706 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001707
1708 switch (kind) {
1709 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001710 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001711#ifdef Py_DEBUG
1712 if (PyUnicode_IS_ASCII(unicode)) {
1713 Py_UCS4 maxchar = ucs1lib_find_max_char(
1714 (const Py_UCS1*)str,
1715 (const Py_UCS1*)str + len);
1716 assert(maxchar < 128);
1717 }
1718#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001719 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001720 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001721 }
1722 case PyUnicode_2BYTE_KIND: {
1723 Py_UCS2 *start = (Py_UCS2 *)data + index;
1724 Py_UCS2 *ucs2 = start;
1725 assert(index <= PyUnicode_GET_LENGTH(unicode));
1726
Victor Stinner184252a2012-06-16 02:57:41 +02001727 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 *ucs2 = (Py_UCS2)*str;
1729
1730 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001731 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 }
1733 default: {
1734 Py_UCS4 *start = (Py_UCS4 *)data + index;
1735 Py_UCS4 *ucs4 = start;
1736 assert(kind == PyUnicode_4BYTE_KIND);
1737 assert(index <= PyUnicode_GET_LENGTH(unicode));
1738
Victor Stinner184252a2012-06-16 02:57:41 +02001739 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001740 *ucs4 = (Py_UCS4)*str;
1741
1742 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001743 }
1744 }
1745}
1746
1747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748static PyObject*
1749get_latin1_char(unsigned char ch)
1750{
Victor Stinnera464fc12011-10-02 20:39:30 +02001751 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001753 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 if (!unicode)
1755 return NULL;
1756 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001757 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 unicode_latin1[ch] = unicode;
1759 }
1760 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001761 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Alexander Belopolsky40018472011-02-26 01:02:56 +00001764PyObject *
1765PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001767 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 Py_UCS4 maxchar = 0;
1769 Py_ssize_t num_surrogates;
1770
1771 if (u == NULL)
1772 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001774 /* If the Unicode data is known at construction time, we can apply
1775 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001778 if (size == 0)
1779 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 /* Single character Unicode objects in the Latin-1 range are
1782 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001783 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 return get_latin1_char((unsigned char)*u);
1785
1786 /* If not empty and not single character, copy the Unicode data
1787 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001788 if (find_maxchar_surrogates(u, u + size,
1789 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 return NULL;
1791
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 if (!unicode)
1794 return NULL;
1795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 switch (PyUnicode_KIND(unicode)) {
1797 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001798 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1800 break;
1801 case PyUnicode_2BYTE_KIND:
1802#if Py_UNICODE_SIZE == 2
1803 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1804#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001805 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1807#endif
1808 break;
1809 case PyUnicode_4BYTE_KIND:
1810#if SIZEOF_WCHAR_T == 2
1811 /* This is the only case which has to process surrogates, thus
1812 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001813 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814#else
1815 assert(num_surrogates == 0);
1816 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1817#endif
1818 break;
1819 default:
1820 assert(0 && "Impossible state");
1821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001823 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824}
1825
Alexander Belopolsky40018472011-02-26 01:02:56 +00001826PyObject *
1827PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001828{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001829 if (size < 0) {
1830 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001831 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 return NULL;
1833 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001834 if (u != NULL)
1835 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1836 else
1837 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001838}
1839
Alexander Belopolsky40018472011-02-26 01:02:56 +00001840PyObject *
1841PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001842{
1843 size_t size = strlen(u);
1844 if (size > PY_SSIZE_T_MAX) {
1845 PyErr_SetString(PyExc_OverflowError, "input too long");
1846 return NULL;
1847 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001848 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001849}
1850
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001851PyObject *
1852_PyUnicode_FromId(_Py_Identifier *id)
1853{
1854 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001855 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1856 strlen(id->string),
1857 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001858 if (!id->object)
1859 return NULL;
1860 PyUnicode_InternInPlace(&id->object);
1861 assert(!id->next);
1862 id->next = static_strings;
1863 static_strings = id;
1864 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001865 return id->object;
1866}
1867
1868void
1869_PyUnicode_ClearStaticStrings()
1870{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001871 _Py_Identifier *tmp, *s = static_strings;
1872 while (s) {
1873 Py_DECREF(s->object);
1874 s->object = NULL;
1875 tmp = s->next;
1876 s->next = NULL;
1877 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001878 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001879 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001880}
1881
Benjamin Peterson0df54292012-03-26 14:50:32 -04001882/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001883
Victor Stinnerd3f08822012-05-29 12:57:52 +02001884PyObject*
1885_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001886{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001887 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001888 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001889 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001890#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001891 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001892#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001893 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001894 }
Victor Stinner785938e2011-12-11 20:09:03 +01001895 unicode = PyUnicode_New(size, 127);
1896 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001897 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001898 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1899 assert(_PyUnicode_CheckConsistency(unicode, 1));
1900 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001901}
1902
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001903static Py_UCS4
1904kind_maxchar_limit(unsigned int kind)
1905{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001906 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907 case PyUnicode_1BYTE_KIND:
1908 return 0x80;
1909 case PyUnicode_2BYTE_KIND:
1910 return 0x100;
1911 case PyUnicode_4BYTE_KIND:
1912 return 0x10000;
1913 default:
1914 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001915 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001916 }
1917}
1918
Victor Stinnere6abb482012-05-02 01:15:40 +02001919Py_LOCAL_INLINE(Py_UCS4)
1920align_maxchar(Py_UCS4 maxchar)
1921{
1922 if (maxchar <= 127)
1923 return 127;
1924 else if (maxchar <= 255)
1925 return 255;
1926 else if (maxchar <= 65535)
1927 return 65535;
1928 else
1929 return MAX_UNICODE;
1930}
1931
Victor Stinner702c7342011-10-05 13:50:52 +02001932static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001933_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001937
Serhiy Storchaka678db842013-01-26 12:16:36 +02001938 if (size == 0)
1939 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001940 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001941 if (size == 1)
1942 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001943
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001945 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 if (!res)
1947 return NULL;
1948 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001949 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001951}
1952
Victor Stinnere57b1c02011-09-28 22:20:48 +02001953static PyObject*
1954_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955{
1956 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001957 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001958
Serhiy Storchaka678db842013-01-26 12:16:36 +02001959 if (size == 0)
1960 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001961 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001962 if (size == 1) {
1963 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001964 int kind;
1965 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001966 if (ch < 256)
1967 return get_latin1_char((unsigned char)ch);
1968
1969 res = PyUnicode_New(1, ch);
1970 if (res == NULL)
1971 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001972 kind = PyUnicode_KIND(res);
1973 data = PyUnicode_DATA(res);
1974 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001975 assert(_PyUnicode_CheckConsistency(res, 1));
1976 return res;
1977 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001979 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 if (!res)
1982 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001983 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001985 else {
1986 _PyUnicode_CONVERT_BYTES(
1987 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1988 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001989 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 return res;
1991}
1992
Victor Stinnere57b1c02011-09-28 22:20:48 +02001993static PyObject*
1994_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995{
1996 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998
Serhiy Storchaka678db842013-01-26 12:16:36 +02001999 if (size == 0)
2000 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002002 if (size == 1) {
2003 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002004 int kind;
2005 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002006 if (ch < 256)
2007 return get_latin1_char((unsigned char)ch);
2008
2009 res = PyUnicode_New(1, ch);
2010 if (res == NULL)
2011 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002012 kind = PyUnicode_KIND(res);
2013 data = PyUnicode_DATA(res);
2014 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002015 assert(_PyUnicode_CheckConsistency(res, 1));
2016 return res;
2017 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002019 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002020 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 if (!res)
2022 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002023 if (max_char < 256)
2024 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2025 PyUnicode_1BYTE_DATA(res));
2026 else if (max_char < 0x10000)
2027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2028 PyUnicode_2BYTE_DATA(res));
2029 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002031 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 return res;
2033}
2034
2035PyObject*
2036PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2037{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002038 if (size < 0) {
2039 PyErr_SetString(PyExc_ValueError, "size must be positive");
2040 return NULL;
2041 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002042 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002044 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002046 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002048 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002049 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002050 PyErr_SetString(PyExc_SystemError, "invalid kind");
2051 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053}
2054
Victor Stinnerece58de2012-04-23 23:36:38 +02002055Py_UCS4
2056_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2057{
2058 enum PyUnicode_Kind kind;
2059 void *startptr, *endptr;
2060
2061 assert(PyUnicode_IS_READY(unicode));
2062 assert(0 <= start);
2063 assert(end <= PyUnicode_GET_LENGTH(unicode));
2064 assert(start <= end);
2065
2066 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2067 return PyUnicode_MAX_CHAR_VALUE(unicode);
2068
2069 if (start == end)
2070 return 127;
2071
Victor Stinner94d558b2012-04-27 22:26:58 +02002072 if (PyUnicode_IS_ASCII(unicode))
2073 return 127;
2074
Victor Stinnerece58de2012-04-23 23:36:38 +02002075 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002076 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002077 endptr = (char *)startptr + end * kind;
2078 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002079 switch(kind) {
2080 case PyUnicode_1BYTE_KIND:
2081 return ucs1lib_find_max_char(startptr, endptr);
2082 case PyUnicode_2BYTE_KIND:
2083 return ucs2lib_find_max_char(startptr, endptr);
2084 case PyUnicode_4BYTE_KIND:
2085 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002086 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002087 assert(0);
2088 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 }
2090}
2091
Victor Stinner25a4b292011-10-06 12:31:55 +02002092/* Ensure that a string uses the most efficient storage, if it is not the
2093 case: create a new string with of the right kind. Write NULL into *p_unicode
2094 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002095static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002096unicode_adjust_maxchar(PyObject **p_unicode)
2097{
2098 PyObject *unicode, *copy;
2099 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002100 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002101 unsigned int kind;
2102
2103 assert(p_unicode != NULL);
2104 unicode = *p_unicode;
2105 assert(PyUnicode_IS_READY(unicode));
2106 if (PyUnicode_IS_ASCII(unicode))
2107 return;
2108
2109 len = PyUnicode_GET_LENGTH(unicode);
2110 kind = PyUnicode_KIND(unicode);
2111 if (kind == PyUnicode_1BYTE_KIND) {
2112 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002113 max_char = ucs1lib_find_max_char(u, u + len);
2114 if (max_char >= 128)
2115 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002116 }
2117 else if (kind == PyUnicode_2BYTE_KIND) {
2118 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002119 max_char = ucs2lib_find_max_char(u, u + len);
2120 if (max_char >= 256)
2121 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 }
2123 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002124 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002125 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002126 max_char = ucs4lib_find_max_char(u, u + len);
2127 if (max_char >= 0x10000)
2128 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002131 if (copy != NULL)
2132 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 Py_DECREF(unicode);
2134 *p_unicode = copy;
2135}
2136
Victor Stinner034f6cf2011-09-30 02:26:44 +02002137PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002138_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002139{
Victor Stinner87af4f22011-11-21 23:03:47 +01002140 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002141 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002142
Victor Stinner034f6cf2011-09-30 02:26:44 +02002143 if (!PyUnicode_Check(unicode)) {
2144 PyErr_BadInternalCall();
2145 return NULL;
2146 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002147 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002148 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149
Victor Stinner87af4f22011-11-21 23:03:47 +01002150 length = PyUnicode_GET_LENGTH(unicode);
2151 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152 if (!copy)
2153 return NULL;
2154 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2155
Victor Stinner87af4f22011-11-21 23:03:47 +01002156 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2157 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002158 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002159 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002160}
2161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163/* Widen Unicode objects to larger buffers. Don't write terminating null
2164 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165
2166void*
2167_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2168{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002169 Py_ssize_t len;
2170 void *result;
2171 unsigned int skind;
2172
Benjamin Petersonbac79492012-01-14 13:34:47 -05002173 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 return NULL;
2175
2176 len = PyUnicode_GET_LENGTH(s);
2177 skind = PyUnicode_KIND(s);
2178 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002179 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return NULL;
2181 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002182 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 case PyUnicode_2BYTE_KIND:
2184 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2185 if (!result)
2186 return PyErr_NoMemory();
2187 assert(skind == PyUnicode_1BYTE_KIND);
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS1, Py_UCS2,
2190 PyUnicode_1BYTE_DATA(s),
2191 PyUnicode_1BYTE_DATA(s) + len,
2192 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002194 case PyUnicode_4BYTE_KIND:
2195 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2196 if (!result)
2197 return PyErr_NoMemory();
2198 if (skind == PyUnicode_2BYTE_KIND) {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS4,
2201 PyUnicode_2BYTE_DATA(s),
2202 PyUnicode_2BYTE_DATA(s) + len,
2203 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002205 else {
2206 assert(skind == PyUnicode_1BYTE_KIND);
2207 _PyUnicode_CONVERT_BYTES(
2208 Py_UCS1, Py_UCS4,
2209 PyUnicode_1BYTE_DATA(s),
2210 PyUnicode_1BYTE_DATA(s) + len,
2211 result);
2212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002214 default:
2215 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 }
Victor Stinner01698042011-10-04 00:04:26 +02002217 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return NULL;
2219}
2220
2221static Py_UCS4*
2222as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2223 int copy_null)
2224{
2225 int kind;
2226 void *data;
2227 Py_ssize_t len, targetlen;
2228 if (PyUnicode_READY(string) == -1)
2229 return NULL;
2230 kind = PyUnicode_KIND(string);
2231 data = PyUnicode_DATA(string);
2232 len = PyUnicode_GET_LENGTH(string);
2233 targetlen = len;
2234 if (copy_null)
2235 targetlen++;
2236 if (!target) {
2237 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2238 PyErr_NoMemory();
2239 return NULL;
2240 }
2241 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2242 if (!target) {
2243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 }
2247 else {
2248 if (targetsize < targetlen) {
2249 PyErr_Format(PyExc_SystemError,
2250 "string is longer than the buffer");
2251 if (copy_null && 0 < targetsize)
2252 target[0] = 0;
2253 return NULL;
2254 }
2255 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 if (kind == PyUnicode_1BYTE_KIND) {
2257 Py_UCS1 *start = (Py_UCS1 *) data;
2258 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 else if (kind == PyUnicode_2BYTE_KIND) {
2261 Py_UCS2 *start = (Py_UCS2 *) data;
2262 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2263 }
2264 else {
2265 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 if (copy_null)
2269 target[len] = 0;
2270 return target;
2271}
2272
2273Py_UCS4*
2274PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2275 int copy_null)
2276{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002277 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 PyErr_BadInternalCall();
2279 return NULL;
2280 }
2281 return as_ucs4(string, target, targetsize, copy_null);
2282}
2283
2284Py_UCS4*
2285PyUnicode_AsUCS4Copy(PyObject *string)
2286{
2287 return as_ucs4(string, NULL, 0, 1);
2288}
2289
2290#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002291
Alexander Belopolsky40018472011-02-26 01:02:56 +00002292PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002293PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002297 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 PyErr_BadInternalCall();
2299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 }
2301
Martin v. Löwis790465f2008-04-05 20:41:37 +00002302 if (size == -1) {
2303 size = wcslen(w);
2304 }
2305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307}
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002310
Walter Dörwald346737f2007-05-31 10:44:43 +00002311static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002312makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002313 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002314{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002315 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002316 if (longflag)
2317 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002318 else if (longlongflag) {
2319 /* longlongflag should only ever be nonzero on machines with
2320 HAVE_LONG_LONG defined */
2321#ifdef HAVE_LONG_LONG
2322 char *f = PY_FORMAT_LONG_LONG;
2323 while (*f)
2324 *fmt++ = *f++;
2325#else
2326 /* we shouldn't ever get here */
2327 assert(0);
2328 *fmt++ = 'l';
2329#endif
2330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 else if (size_tflag) {
2332 char *f = PY_FORMAT_SIZE_T;
2333 while (*f)
2334 *fmt++ = *f++;
2335 }
2336 *fmt++ = c;
2337 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002338}
2339
Victor Stinner15a11362012-10-06 23:48:20 +02002340/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002341 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2342 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2343#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002344
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002345static int
2346unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2347 Py_ssize_t width, Py_ssize_t precision)
2348{
2349 Py_ssize_t length, fill, arglen;
2350 Py_UCS4 maxchar;
2351
2352 if (PyUnicode_READY(str) == -1)
2353 return -1;
2354
2355 length = PyUnicode_GET_LENGTH(str);
2356 if ((precision == -1 || precision >= length)
2357 && width <= length)
2358 return _PyUnicodeWriter_WriteStr(writer, str);
2359
2360 if (precision != -1)
2361 length = Py_MIN(precision, length);
2362
2363 arglen = Py_MAX(length, width);
2364 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2365 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2366 else
2367 maxchar = writer->maxchar;
2368
2369 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2370 return -1;
2371
2372 if (width > length) {
2373 fill = width - length;
2374 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2375 return -1;
2376 writer->pos += fill;
2377 }
2378
2379 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2380 str, 0, length);
2381 writer->pos += length;
2382 return 0;
2383}
2384
2385static int
2386unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2387 Py_ssize_t width, Py_ssize_t precision)
2388{
2389 /* UTF-8 */
2390 Py_ssize_t length;
2391 PyObject *unicode;
2392 int res;
2393
2394 length = strlen(str);
2395 if (precision != -1)
2396 length = Py_MIN(length, precision);
2397 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2398 if (unicode == NULL)
2399 return -1;
2400
2401 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2402 Py_DECREF(unicode);
2403 return res;
2404}
2405
Victor Stinner96865452011-03-01 23:44:09 +00002406static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002407unicode_fromformat_arg(_PyUnicodeWriter *writer,
2408 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002409{
Victor Stinnere215d962012-10-06 23:03:36 +02002410 const char *p;
2411 Py_ssize_t len;
2412 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 Py_ssize_t width;
2414 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002415 int longflag;
2416 int longlongflag;
2417 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002419
2420 p = f;
2421 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002422 zeropad = 0;
2423 if (*f == '0') {
2424 zeropad = 1;
2425 f++;
2426 }
Victor Stinner96865452011-03-01 23:44:09 +00002427
2428 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002429 width = -1;
2430 if (Py_ISDIGIT((unsigned)*f)) {
2431 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002432 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002433 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002434 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002435 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002436 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002437 return NULL;
2438 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002440 f++;
2441 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002442 }
2443 precision = -1;
2444 if (*f == '.') {
2445 f++;
2446 if (Py_ISDIGIT((unsigned)*f)) {
2447 precision = (*f - '0');
2448 f++;
2449 while (Py_ISDIGIT((unsigned)*f)) {
2450 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2451 PyErr_SetString(PyExc_ValueError,
2452 "precision too big");
2453 return NULL;
2454 }
2455 precision = (precision * 10) + (*f - '0');
2456 f++;
2457 }
2458 }
Victor Stinner96865452011-03-01 23:44:09 +00002459 if (*f == '%') {
2460 /* "%.3%s" => f points to "3" */
2461 f--;
2462 }
2463 }
2464 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002465 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002466 f--;
2467 }
Victor Stinner96865452011-03-01 23:44:09 +00002468
2469 /* Handle %ld, %lu, %lld and %llu. */
2470 longflag = 0;
2471 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002472 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002473 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002474 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002475 longflag = 1;
2476 ++f;
2477 }
2478#ifdef HAVE_LONG_LONG
2479 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002480 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002481 longlongflag = 1;
2482 f += 2;
2483 }
2484#endif
2485 }
2486 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002487 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002488 size_tflag = 1;
2489 ++f;
2490 }
Victor Stinnere215d962012-10-06 23:03:36 +02002491
2492 if (f[1] == '\0')
2493 writer->overallocate = 0;
2494
2495 switch (*f) {
2496 case 'c':
2497 {
2498 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002499 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002500 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002501 "character argument not in range(0x110000)");
2502 return NULL;
2503 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002504 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002505 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002506 break;
2507 }
2508
2509 case 'i':
2510 case 'd':
2511 case 'u':
2512 case 'x':
2513 {
2514 /* used by sprintf */
2515 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002516 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002517 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002518
2519 if (*f == 'u') {
2520 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2521
2522 if (longflag)
2523 len = sprintf(buffer, fmt,
2524 va_arg(*vargs, unsigned long));
2525#ifdef HAVE_LONG_LONG
2526 else if (longlongflag)
2527 len = sprintf(buffer, fmt,
2528 va_arg(*vargs, unsigned PY_LONG_LONG));
2529#endif
2530 else if (size_tflag)
2531 len = sprintf(buffer, fmt,
2532 va_arg(*vargs, size_t));
2533 else
2534 len = sprintf(buffer, fmt,
2535 va_arg(*vargs, unsigned int));
2536 }
2537 else if (*f == 'x') {
2538 makefmt(fmt, 0, 0, 0, 'x');
2539 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2540 }
2541 else {
2542 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2543
2544 if (longflag)
2545 len = sprintf(buffer, fmt,
2546 va_arg(*vargs, long));
2547#ifdef HAVE_LONG_LONG
2548 else if (longlongflag)
2549 len = sprintf(buffer, fmt,
2550 va_arg(*vargs, PY_LONG_LONG));
2551#endif
2552 else if (size_tflag)
2553 len = sprintf(buffer, fmt,
2554 va_arg(*vargs, Py_ssize_t));
2555 else
2556 len = sprintf(buffer, fmt,
2557 va_arg(*vargs, int));
2558 }
2559 assert(len >= 0);
2560
Victor Stinnere215d962012-10-06 23:03:36 +02002561 if (precision < len)
2562 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002563
2564 arglen = Py_MAX(precision, width);
2565 assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127);
2566 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2567 return NULL;
2568
Victor Stinnere215d962012-10-06 23:03:36 +02002569 if (width > precision) {
2570 Py_UCS4 fillchar;
2571 fill = width - precision;
2572 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002573 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2574 return NULL;
2575 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002576 }
Victor Stinner15a11362012-10-06 23:48:20 +02002577 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002578 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002579 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2580 return NULL;
2581 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002582 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583
2584 unicode_write_cstr(writer->buffer, writer->pos, buffer, len);
2585 writer->pos += len;
Victor Stinnere215d962012-10-06 23:03:36 +02002586 break;
2587 }
2588
2589 case 'p':
2590 {
2591 char number[MAX_LONG_LONG_CHARS];
2592
2593 len = sprintf(number, "%p", va_arg(*vargs, void*));
2594 assert(len >= 0);
2595
2596 /* %p is ill-defined: ensure leading 0x. */
2597 if (number[1] == 'X')
2598 number[1] = 'x';
2599 else if (number[1] != 'x') {
2600 memmove(number + 2, number,
2601 strlen(number) + 1);
2602 number[0] = '0';
2603 number[1] = 'x';
2604 len += 2;
2605 }
2606
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002607 assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127);
2608 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002609 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002610 unicode_write_cstr(writer->buffer, writer->pos, number, len);
2611 writer->pos += len;
Victor Stinnere215d962012-10-06 23:03:36 +02002612 break;
2613 }
2614
2615 case 's':
2616 {
2617 /* UTF-8 */
2618 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002620 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002621 break;
2622 }
2623
2624 case 'U':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 assert(obj && _PyUnicode_CHECK(obj));
2628
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002630 return NULL;
2631 break;
2632 }
2633
2634 case 'V':
2635 {
2636 PyObject *obj = va_arg(*vargs, PyObject *);
2637 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002638 if (obj) {
2639 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002640 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002641 return NULL;
2642 }
2643 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 assert(str != NULL);
2645 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002646 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002647 }
2648 break;
2649 }
2650
2651 case 'S':
2652 {
2653 PyObject *obj = va_arg(*vargs, PyObject *);
2654 PyObject *str;
2655 assert(obj);
2656 str = PyObject_Str(obj);
2657 if (!str)
2658 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002659 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002660 Py_DECREF(str);
2661 return NULL;
2662 }
2663 Py_DECREF(str);
2664 break;
2665 }
2666
2667 case 'R':
2668 {
2669 PyObject *obj = va_arg(*vargs, PyObject *);
2670 PyObject *repr;
2671 assert(obj);
2672 repr = PyObject_Repr(obj);
2673 if (!repr)
2674 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002676 Py_DECREF(repr);
2677 return NULL;
2678 }
2679 Py_DECREF(repr);
2680 break;
2681 }
2682
2683 case 'A':
2684 {
2685 PyObject *obj = va_arg(*vargs, PyObject *);
2686 PyObject *ascii;
2687 assert(obj);
2688 ascii = PyObject_ASCII(obj);
2689 if (!ascii)
2690 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002692 Py_DECREF(ascii);
2693 return NULL;
2694 }
2695 Py_DECREF(ascii);
2696 break;
2697 }
2698
2699 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002700 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002701 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002702 break;
2703
2704 default:
2705 /* if we stumble upon an unknown formatting code, copy the rest
2706 of the format string to the output string. (we cannot just
2707 skip the code, since there's no way to know what's in the
2708 argument list) */
2709 len = strlen(p);
2710 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2711 return NULL;
2712 f = p+len;
2713 return f;
2714 }
2715
2716 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002717 return f;
2718}
2719
Walter Dörwaldd2034312007-05-18 16:29:38 +00002720PyObject *
2721PyUnicode_FromFormatV(const char *format, va_list vargs)
2722{
Victor Stinnere215d962012-10-06 23:03:36 +02002723 va_list vargs2;
2724 const char *f;
2725 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002726
Victor Stinner8f674cc2013-04-17 23:02:17 +02002727 _PyUnicodeWriter_Init(&writer);
2728 writer.min_length = strlen(format) + 100;
2729 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002730
2731 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2732 Copy it to be able to pass a reference to a subfunction. */
2733 Py_VA_COPY(vargs2, vargs);
2734
2735 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002737 f = unicode_fromformat_arg(&writer, f, &vargs2);
2738 if (f == NULL)
2739 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 const char *p;
2743 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002744
Victor Stinnere215d962012-10-06 23:03:36 +02002745 p = f;
2746 do
2747 {
2748 if ((unsigned char)*p > 127) {
2749 PyErr_Format(PyExc_ValueError,
2750 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2751 "string, got a non-ASCII byte: 0x%02x",
2752 (unsigned char)*p);
2753 return NULL;
2754 }
2755 p++;
2756 }
2757 while (*p != '\0' && *p != '%');
2758 len = p - f;
2759
2760 if (*p == '\0')
2761 writer.overallocate = 0;
2762 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2763 goto fail;
2764 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2765 writer.pos += len;
2766
2767 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 }
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return _PyUnicodeWriter_Finish(&writer);
2771
2772 fail:
2773 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775}
2776
Walter Dörwaldd2034312007-05-18 16:29:38 +00002777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002780 PyObject* ret;
2781 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782
2783#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 ret = PyUnicode_FromFormatV(format, vargs);
2789 va_end(vargs);
2790 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002791}
2792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793#ifdef HAVE_WCHAR_H
2794
Victor Stinner5593d8a2010-10-02 11:11:27 +00002795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796 convert a Unicode object to a wide character string.
2797
Victor Stinnerd88d9832011-09-06 02:00:05 +02002798 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002799 character) required to convert the unicode object. Ignore size argument.
2800
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002803 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002804static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002806 wchar_t *w,
2807 Py_ssize_t size)
2808{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 const wchar_t *wstr;
2811
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (wstr == NULL)
2814 return -1;
2815
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002817 if (size > res)
2818 size = res + 1;
2819 else
2820 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002822 return res;
2823 }
2824 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002826}
2827
2828Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002829PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 wchar_t *w,
2831 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832{
2833 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 PyErr_BadInternalCall();
2835 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002837 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838}
2839
Victor Stinner137c34c2010-09-29 10:25:54 +00002840wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002841PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002842 Py_ssize_t *size)
2843{
2844 wchar_t* buffer;
2845 Py_ssize_t buflen;
2846
2847 if (unicode == NULL) {
2848 PyErr_BadInternalCall();
2849 return NULL;
2850 }
2851
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002852 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002853 if (buflen == -1)
2854 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002855 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002856 PyErr_NoMemory();
2857 return NULL;
2858 }
2859
Victor Stinner137c34c2010-09-29 10:25:54 +00002860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2861 if (buffer == NULL) {
2862 PyErr_NoMemory();
2863 return NULL;
2864 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002865 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002866 if (buflen == -1) {
2867 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002869 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 if (size != NULL)
2871 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002872 return buffer;
2873}
2874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002875#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Alexander Belopolsky40018472011-02-26 01:02:56 +00002877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002881 void *data;
2882 int kind;
2883
Victor Stinner8faf8212011-12-08 22:14:11 +01002884 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 PyErr_SetString(PyExc_ValueError,
2886 "chr() arg not in range(0x110000)");
2887 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002888 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002889
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002890 if ((Py_UCS4)ordinal < 256)
2891 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 v = PyUnicode_New(1, ordinal);
2894 if (v == NULL)
2895 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002896 kind = PyUnicode_KIND(v);
2897 data = PyUnicode_DATA(v);
2898 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002899 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002901}
2902
Alexander Belopolsky40018472011-02-26 01:02:56 +00002903PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002904PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002906 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002908 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002909 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002910 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 Py_INCREF(obj);
2912 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002913 }
2914 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 /* For a Unicode subtype that's not a Unicode object,
2916 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002917 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002918 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002919 PyErr_Format(PyExc_TypeError,
2920 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002921 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002922 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002923}
2924
Alexander Belopolsky40018472011-02-26 01:02:56 +00002925PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002926PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002927 const char *encoding,
2928 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002929{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002930 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002931 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002932
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 PyErr_BadInternalCall();
2935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Decoding bytes objects is the most common case and should be fast */
2939 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002940 if (PyBytes_GET_SIZE(obj) == 0)
2941 _Py_RETURN_UNICODE_EMPTY();
2942 v = PyUnicode_Decode(
2943 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2944 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002945 return v;
2946 }
2947
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002948 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 PyErr_SetString(PyExc_TypeError,
2950 "decoding str is not supported");
2951 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002953
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002954 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2955 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2956 PyErr_Format(PyExc_TypeError,
2957 "coercing to str: need bytes, bytearray "
2958 "or buffer-like object, %.80s found",
2959 Py_TYPE(obj)->tp_name);
2960 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002961 }
Tim Petersced69f82003-09-16 20:30:58 +00002962
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002963 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002964 PyBuffer_Release(&buffer);
2965 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002967
Serhiy Storchaka05997252013-01-26 12:14:02 +02002968 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002969 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002970 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971}
2972
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The null-terminated result is stored into lower, a buffer of lower_len
   bytes.  Return 1 on success, 0 on error (the normalized name does not
   fit: encoding is longer than lower_len-1, or lower_len is too small to
   hold "utf-8" when encoding is NULL). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* sizeof("utf-8") counts the terminating null byte: never copy
           the default name into a buffer that cannot hold it. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    /* Without room for even the terminator, &lower[lower_len - 1] below
       would point before the start of the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3009
Alexander Belopolsky40018472011-02-26 01:02:56 +00003010PyObject *
3011PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003012 Py_ssize_t size,
3013 const char *encoding,
3014 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003015{
3016 PyObject *buffer = NULL, *unicode;
3017 Py_buffer info;
3018 char lower[11]; /* Enough for any encoding shortcut */
3019
Fred Drakee4315f52000-05-09 19:53:39 +00003020 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003021 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003022 if ((strcmp(lower, "utf-8") == 0) ||
3023 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003024 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003025 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003026 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003027 (strcmp(lower, "iso-8859-1") == 0) ||
3028 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003029 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003030#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003031 else if (strcmp(lower, "mbcs") == 0)
3032 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003033#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003034 else if (strcmp(lower, "ascii") == 0)
3035 return PyUnicode_DecodeASCII(s, size, errors);
3036 else if (strcmp(lower, "utf-16") == 0)
3037 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3038 else if (strcmp(lower, "utf-32") == 0)
3039 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041
3042 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003043 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003044 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003045 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003046 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 if (buffer == NULL)
3048 goto onError;
3049 unicode = PyCodec_Decode(buffer, encoding, errors);
3050 if (unicode == NULL)
3051 goto onError;
3052 if (!PyUnicode_Check(unicode)) {
3053 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003054 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003055 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 Py_DECREF(unicode);
3057 goto onError;
3058 }
3059 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003060 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003061
Benjamin Peterson29060642009-01-31 22:14:21 +00003062 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 Py_XDECREF(buffer);
3064 return NULL;
3065}
3066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
3068PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 const char *encoding,
3070 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003071{
3072 PyObject *v;
3073
3074 if (!PyUnicode_Check(unicode)) {
3075 PyErr_BadArgument();
3076 goto onError;
3077 }
3078
3079 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003081
3082 /* Decode via the codec registry */
3083 v = PyCodec_Decode(unicode, encoding, errors);
3084 if (v == NULL)
3085 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003086 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003087
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003089 return NULL;
3090}
3091
Alexander Belopolsky40018472011-02-26 01:02:56 +00003092PyObject *
3093PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003094 const char *encoding,
3095 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003096{
3097 PyObject *v;
3098
3099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
3101 goto onError;
3102 }
3103
3104 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106
3107 /* Decode via the codec registry */
3108 v = PyCodec_Decode(unicode, encoding, errors);
3109 if (v == NULL)
3110 goto onError;
3111 if (!PyUnicode_Check(v)) {
3112 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003113 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003114 Py_TYPE(v)->tp_name);
3115 Py_DECREF(v);
3116 goto onError;
3117 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003118 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003119
Benjamin Peterson29060642009-01-31 22:14:21 +00003120 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003121 return NULL;
3122}
3123
Alexander Belopolsky40018472011-02-26 01:02:56 +00003124PyObject *
3125PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003126 Py_ssize_t size,
3127 const char *encoding,
3128 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129{
3130 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003131
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 unicode = PyUnicode_FromUnicode(s, size);
3133 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3136 Py_DECREF(unicode);
3137 return v;
3138}
3139
Alexander Belopolsky40018472011-02-26 01:02:56 +00003140PyObject *
3141PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003142 const char *encoding,
3143 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003144{
3145 PyObject *v;
3146
3147 if (!PyUnicode_Check(unicode)) {
3148 PyErr_BadArgument();
3149 goto onError;
3150 }
3151
3152 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154
3155 /* Encode via the codec registry */
3156 v = PyCodec_Encode(unicode, encoding, errors);
3157 if (v == NULL)
3158 goto onError;
3159 return v;
3160
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003162 return NULL;
3163}
3164
/* Locate the first wide character of the null-terminated string wstr
   that wcstombs() refuses to convert.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 16 bits wide).  Return the index, in wchar_t
   units, of the first unit that fails; return 0 when every unit converts,
   which is indistinguishable from a failure at index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator of the one-unit string. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3211
Victor Stinner1b579672011-12-17 05:47:23 +01003212static int
3213locale_error_handler(const char *errors, int *surrogateescape)
3214{
3215 if (errors == NULL) {
3216 *surrogateescape = 0;
3217 return 0;
3218 }
3219
3220 if (strcmp(errors, "strict") == 0) {
3221 *surrogateescape = 0;
3222 return 0;
3223 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003224 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003225 *surrogateescape = 1;
3226 return 0;
3227 }
3228 PyErr_Format(PyExc_ValueError,
3229 "only 'strict' and 'surrogateescape' error handlers "
3230 "are supported, not '%s'",
3231 errors);
3232 return -1;
3233}
3234
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003235PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003236PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003237{
3238 Py_ssize_t wlen, wlen2;
3239 wchar_t *wstr;
3240 PyObject *bytes = NULL;
3241 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003242 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003243 PyObject *exc;
3244 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003245 int surrogateescape;
3246
3247 if (locale_error_handler(errors, &surrogateescape) < 0)
3248 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003249
3250 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3251 if (wstr == NULL)
3252 return NULL;
3253
3254 wlen2 = wcslen(wstr);
3255 if (wlen2 != wlen) {
3256 PyMem_Free(wstr);
3257 PyErr_SetString(PyExc_TypeError, "embedded null character");
3258 return NULL;
3259 }
3260
3261 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003262 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003263 char *str;
3264
3265 str = _Py_wchar2char(wstr, &error_pos);
3266 if (str == NULL) {
3267 if (error_pos == (size_t)-1) {
3268 PyErr_NoMemory();
3269 PyMem_Free(wstr);
3270 return NULL;
3271 }
3272 else {
3273 goto encode_error;
3274 }
3275 }
3276 PyMem_Free(wstr);
3277
3278 bytes = PyBytes_FromString(str);
3279 PyMem_Free(str);
3280 }
3281 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003282 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003283 size_t len, len2;
3284
3285 len = wcstombs(NULL, wstr, 0);
3286 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003287 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003288 goto encode_error;
3289 }
3290
3291 bytes = PyBytes_FromStringAndSize(NULL, len);
3292 if (bytes == NULL) {
3293 PyMem_Free(wstr);
3294 return NULL;
3295 }
3296
3297 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3298 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003299 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003300 goto encode_error;
3301 }
3302 PyMem_Free(wstr);
3303 }
3304 return bytes;
3305
3306encode_error:
3307 errmsg = strerror(errno);
3308 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003309
3310 if (error_pos == (size_t)-1)
3311 error_pos = wcstombs_errorpos(wstr);
3312
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003313 PyMem_Free(wstr);
3314 Py_XDECREF(bytes);
3315
Victor Stinner2f197072011-12-17 07:08:30 +01003316 if (errmsg != NULL) {
3317 size_t errlen;
3318 wstr = _Py_char2wchar(errmsg, &errlen);
3319 if (wstr != NULL) {
3320 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003321 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003322 } else
3323 errmsg = NULL;
3324 }
3325 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003326 reason = PyUnicode_FromString(
3327 "wcstombs() encountered an unencodable "
3328 "wide character");
3329 if (reason == NULL)
3330 return NULL;
3331
3332 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3333 "locale", unicode,
3334 (Py_ssize_t)error_pos,
3335 (Py_ssize_t)(error_pos+1),
3336 reason);
3337 Py_DECREF(reason);
3338 if (exc != NULL) {
3339 PyCodec_StrictErrors(exc);
3340 Py_XDECREF(exc);
3341 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003342 return NULL;
3343}
3344
Victor Stinnerad158722010-10-27 00:25:46 +00003345PyObject *
3346PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003347{
Victor Stinner99b95382011-07-04 14:23:54 +02003348#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003349 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003350#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003352#else
Victor Stinner793b5312011-04-27 00:24:21 +02003353 PyInterpreterState *interp = PyThreadState_GET()->interp;
3354 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3355 cannot use it to encode and decode filenames before it is loaded. Load
3356 the Python codec requires to encode at least its own filename. Use the C
3357 version of the locale codec until the codec registry is initialized and
3358 the Python codec is loaded.
3359
3360 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3361 cannot only rely on it: check also interp->fscodec_initialized for
3362 subinterpreters. */
3363 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003364 return PyUnicode_AsEncodedString(unicode,
3365 Py_FileSystemDefaultEncoding,
3366 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003367 }
3368 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003369 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003370 }
Victor Stinnerad158722010-10-27 00:25:46 +00003371#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003372}
3373
Alexander Belopolsky40018472011-02-26 01:02:56 +00003374PyObject *
3375PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003376 const char *encoding,
3377 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378{
3379 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003380 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003381
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 if (!PyUnicode_Check(unicode)) {
3383 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 }
Fred Drakee4315f52000-05-09 19:53:39 +00003386
Fred Drakee4315f52000-05-09 19:53:39 +00003387 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003388 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003389 if ((strcmp(lower, "utf-8") == 0) ||
3390 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003391 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003392 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003394 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003395 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003396 }
Victor Stinner37296e82010-06-10 13:36:23 +00003397 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003398 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003399 (strcmp(lower, "iso-8859-1") == 0) ||
3400 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003401 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003402#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003403 else if (strcmp(lower, "mbcs") == 0)
3404 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003405#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003406 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003407 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409
3410 /* Encode via the codec registry */
3411 v = PyCodec_Encode(unicode, encoding, errors);
3412 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003413 return NULL;
3414
3415 /* The normal path */
3416 if (PyBytes_Check(v))
3417 return v;
3418
3419 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003420 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003421 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003422 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003423
3424 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3425 "encoder %s returned bytearray instead of bytes",
3426 encoding);
3427 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003428 Py_DECREF(v);
3429 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003430 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003431
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003432 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3433 Py_DECREF(v);
3434 return b;
3435 }
3436
3437 PyErr_Format(PyExc_TypeError,
3438 "encoder did not return a bytes object (type=%.400s)",
3439 Py_TYPE(v)->tp_name);
3440 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003441 return NULL;
3442}
3443
Alexander Belopolsky40018472011-02-26 01:02:56 +00003444PyObject *
3445PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003446 const char *encoding,
3447 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003448{
3449 PyObject *v;
3450
3451 if (!PyUnicode_Check(unicode)) {
3452 PyErr_BadArgument();
3453 goto onError;
3454 }
3455
3456 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003457 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003458
3459 /* Encode via the codec registry */
3460 v = PyCodec_Encode(unicode, encoding, errors);
3461 if (v == NULL)
3462 goto onError;
3463 if (!PyUnicode_Check(v)) {
3464 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003465 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003466 Py_TYPE(v)->tp_name);
3467 Py_DECREF(v);
3468 goto onError;
3469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003471
Benjamin Peterson29060642009-01-31 22:14:21 +00003472 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473 return NULL;
3474}
3475
/* Locate the first byte sequence within the len bytes at str that
   mbrtowc() cannot decode (invalid or incomplete character).

   Return its byte offset from str.  Return 0 when no such sequence is
   found, or when mbrtowc() is not available (HAVE_MBRTOWC undefined);
   this is indistinguishable from a failure at offset 0. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    /* Start from the initial conversion state. */
    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, (char*)cursor, remaining, &state);

        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            /* Conversion error or incomplete character */
            return cursor - str;

        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3506
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003507PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003508PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003509 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003510{
3511 wchar_t smallbuf[256];
3512 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3513 wchar_t *wstr;
3514 size_t wlen, wlen2;
3515 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003516 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003517 size_t error_pos;
3518 char *errmsg;
3519 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003520
3521 if (locale_error_handler(errors, &surrogateescape) < 0)
3522 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003523
3524 if (str[len] != '\0' || len != strlen(str)) {
3525 PyErr_SetString(PyExc_TypeError, "embedded null character");
3526 return NULL;
3527 }
3528
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003529 if (surrogateescape) {
3530 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531 wstr = _Py_char2wchar(str, &wlen);
3532 if (wstr == NULL) {
3533 if (wlen == (size_t)-1)
3534 PyErr_NoMemory();
3535 else
3536 PyErr_SetFromErrno(PyExc_OSError);
3537 return NULL;
3538 }
3539
3540 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003541 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003542 }
3543 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003544 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003545#ifndef HAVE_BROKEN_MBSTOWCS
3546 wlen = mbstowcs(NULL, str, 0);
3547#else
3548 wlen = len;
3549#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003550 if (wlen == (size_t)-1)
3551 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003552 if (wlen+1 <= smallbuf_len) {
3553 wstr = smallbuf;
3554 }
3555 else {
3556 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3557 return PyErr_NoMemory();
3558
3559 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3560 if (!wstr)
3561 return PyErr_NoMemory();
3562 }
3563
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003564 wlen2 = mbstowcs(wstr, str, wlen+1);
3565 if (wlen2 == (size_t)-1) {
3566 if (wstr != smallbuf)
3567 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003568 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003569 }
3570#ifdef HAVE_BROKEN_MBSTOWCS
3571 assert(wlen2 == wlen);
3572#endif
3573 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3574 if (wstr != smallbuf)
3575 PyMem_Free(wstr);
3576 }
3577 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003578
3579decode_error:
3580 errmsg = strerror(errno);
3581 assert(errmsg != NULL);
3582
3583 error_pos = mbstowcs_errorpos(str, len);
3584 if (errmsg != NULL) {
3585 size_t errlen;
3586 wstr = _Py_char2wchar(errmsg, &errlen);
3587 if (wstr != NULL) {
3588 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003589 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003590 } else
3591 errmsg = NULL;
3592 }
3593 if (errmsg == NULL)
3594 reason = PyUnicode_FromString(
3595 "mbstowcs() encountered an invalid multibyte sequence");
3596 if (reason == NULL)
3597 return NULL;
3598
3599 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3600 "locale", str, len,
3601 (Py_ssize_t)error_pos,
3602 (Py_ssize_t)(error_pos+1),
3603 reason);
3604 Py_DECREF(reason);
3605 if (exc != NULL) {
3606 PyCodec_StrictErrors(exc);
3607 Py_XDECREF(exc);
3608 }
3609 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003610}
3611
3612PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003613PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003614{
3615 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003616 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003617}
3618
3619
3620PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003621PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003622 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003623 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3624}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003625
Christian Heimes5894ba72007-11-04 11:43:14 +00003626PyObject*
3627PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3628{
Victor Stinner99b95382011-07-04 14:23:54 +02003629#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003630 return PyUnicode_DecodeMBCS(s, size, NULL);
3631#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003632 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003633#else
Victor Stinner793b5312011-04-27 00:24:21 +02003634 PyInterpreterState *interp = PyThreadState_GET()->interp;
3635 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3636 cannot use it to encode and decode filenames before it is loaded. Load
3637 the Python codec requires to encode at least its own filename. Use the C
3638 version of the locale codec until the codec registry is initialized and
3639 the Python codec is loaded.
3640
3641 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3642 cannot only rely on it: check also interp->fscodec_initialized for
3643 subinterpreters. */
3644 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003645 return PyUnicode_Decode(s, size,
3646 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003647 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003648 }
3649 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003650 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003651 }
Victor Stinnerad158722010-10-27 00:25:46 +00003652#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003653}
3654
Martin v. Löwis011e8422009-05-05 04:43:17 +00003655
3656int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003657_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003658{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003659 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003660
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003661 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003662 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003663 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3664 PyUnicode_GET_LENGTH(str), '\0', 1);
3665 if (pos == -1)
3666 return 0;
3667 else
3668 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003669}
3670
Antoine Pitrou13348842012-01-29 18:36:34 +01003671int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003672PyUnicode_FSConverter(PyObject* arg, void* addr)
3673{
3674 PyObject *output = NULL;
3675 Py_ssize_t size;
3676 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003677 if (arg == NULL) {
3678 Py_DECREF(*(PyObject**)addr);
3679 return 1;
3680 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003681 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003682 output = arg;
3683 Py_INCREF(output);
3684 }
3685 else {
3686 arg = PyUnicode_FromObject(arg);
3687 if (!arg)
3688 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003689 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003690 Py_DECREF(arg);
3691 if (!output)
3692 return 0;
3693 if (!PyBytes_Check(output)) {
3694 Py_DECREF(output);
3695 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3696 return 0;
3697 }
3698 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003699 size = PyBytes_GET_SIZE(output);
3700 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003701 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003702 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003703 Py_DECREF(output);
3704 return 0;
3705 }
3706 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003707 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003708}
3709
3710
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003711int
3712PyUnicode_FSDecoder(PyObject* arg, void* addr)
3713{
3714 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003715 if (arg == NULL) {
3716 Py_DECREF(*(PyObject**)addr);
3717 return 1;
3718 }
3719 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003720 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003722 output = arg;
3723 Py_INCREF(output);
3724 }
3725 else {
3726 arg = PyBytes_FromObject(arg);
3727 if (!arg)
3728 return 0;
3729 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3730 PyBytes_GET_SIZE(arg));
3731 Py_DECREF(arg);
3732 if (!output)
3733 return 0;
3734 if (!PyUnicode_Check(output)) {
3735 Py_DECREF(output);
3736 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3737 return 0;
3738 }
3739 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003740 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003741 Py_DECREF(output);
3742 return 0;
3743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003745 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003746 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3747 Py_DECREF(output);
3748 return 0;
3749 }
3750 *(PyObject**)addr = output;
3751 return Py_CLEANUP_SUPPORTED;
3752}
3753
3754
Martin v. Löwis5b222132007-06-10 09:51:05 +00003755char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003757{
Christian Heimesf3863112007-11-22 07:46:41 +00003758 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003759
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003760 if (!PyUnicode_Check(unicode)) {
3761 PyErr_BadArgument();
3762 return NULL;
3763 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003764 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003765 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003767 if (PyUnicode_UTF8(unicode) == NULL) {
3768 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3770 if (bytes == NULL)
3771 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3773 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003774 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775 Py_DECREF(bytes);
3776 return NULL;
3777 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003778 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3779 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3780 PyBytes_AS_STRING(bytes),
3781 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 Py_DECREF(bytes);
3783 }
3784
3785 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003786 *psize = PyUnicode_UTF8_LENGTH(unicode);
3787 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003788}
3789
3790char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3794}
3795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796Py_UNICODE *
3797PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799 const unsigned char *one_byte;
3800#if SIZEOF_WCHAR_T == 4
3801 const Py_UCS2 *two_bytes;
3802#else
3803 const Py_UCS4 *four_bytes;
3804 const Py_UCS4 *ucs4_end;
3805 Py_ssize_t num_surrogates;
3806#endif
3807 wchar_t *w;
3808 wchar_t *wchar_end;
3809
3810 if (!PyUnicode_Check(unicode)) {
3811 PyErr_BadArgument();
3812 return NULL;
3813 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003814 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003816 assert(_PyUnicode_KIND(unicode) != 0);
3817 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003819 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003821 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3822 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823 num_surrogates = 0;
3824
3825 for (; four_bytes < ucs4_end; ++four_bytes) {
3826 if (*four_bytes > 0xFFFF)
3827 ++num_surrogates;
3828 }
3829
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003830 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3831 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3832 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833 PyErr_NoMemory();
3834 return NULL;
3835 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003836 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003838 w = _PyUnicode_WSTR(unicode);
3839 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3840 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3842 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003843 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003845 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3846 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 }
3848 else
3849 *w = *four_bytes;
3850
3851 if (w > wchar_end) {
3852 assert(0 && "Miscalculated string end");
3853 }
3854 }
3855 *w = 0;
3856#else
3857 /* sizeof(wchar_t) == 4 */
3858 Py_FatalError("Impossible unicode object state, wstr and str "
3859 "should share memory already.");
3860 return NULL;
3861#endif
3862 }
3863 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003864 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3865 (_PyUnicode_LENGTH(unicode) + 1));
3866 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 PyErr_NoMemory();
3868 return NULL;
3869 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003870 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3871 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3872 w = _PyUnicode_WSTR(unicode);
3873 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003875 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3876 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 for (; w < wchar_end; ++one_byte, ++w)
3878 *w = *one_byte;
3879 /* null-terminate the wstr */
3880 *w = 0;
3881 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003884 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 for (; w < wchar_end; ++two_bytes, ++w)
3886 *w = *two_bytes;
3887 /* null-terminate the wstr */
3888 *w = 0;
3889#else
3890 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 PyObject_FREE(_PyUnicode_WSTR(unicode));
3892 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 Py_FatalError("Impossible unicode object state, wstr "
3894 "and str should share memory already.");
3895 return NULL;
3896#endif
3897 }
3898 else {
3899 assert(0 && "This should never happen.");
3900 }
3901 }
3902 }
3903 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003904 *size = PyUnicode_WSTR_LENGTH(unicode);
3905 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003906}
3907
Alexander Belopolsky40018472011-02-26 01:02:56 +00003908Py_UNICODE *
3909PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912}
3913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914
Alexander Belopolsky40018472011-02-26 01:02:56 +00003915Py_ssize_t
3916PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917{
3918 if (!PyUnicode_Check(unicode)) {
3919 PyErr_BadArgument();
3920 goto onError;
3921 }
3922 return PyUnicode_GET_SIZE(unicode);
3923
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 return -1;
3926}
3927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928Py_ssize_t
3929PyUnicode_GetLength(PyObject *unicode)
3930{
Victor Stinner07621332012-06-16 04:53:46 +02003931 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003932 PyErr_BadArgument();
3933 return -1;
3934 }
Victor Stinner07621332012-06-16 04:53:46 +02003935 if (PyUnicode_READY(unicode) == -1)
3936 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937 return PyUnicode_GET_LENGTH(unicode);
3938}
3939
3940Py_UCS4
3941PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3942{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003943 void *data;
3944 int kind;
3945
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003946 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3947 PyErr_BadArgument();
3948 return (Py_UCS4)-1;
3949 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003950 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003951 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 return (Py_UCS4)-1;
3953 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003954 data = PyUnicode_DATA(unicode);
3955 kind = PyUnicode_KIND(unicode);
3956 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957}
3958
3959int
3960PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3961{
3962 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003963 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 return -1;
3965 }
Victor Stinner488fa492011-12-12 00:01:39 +01003966 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003967 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003968 PyErr_SetString(PyExc_IndexError, "string index out of range");
3969 return -1;
3970 }
Victor Stinner488fa492011-12-12 00:01:39 +01003971 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003972 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003973 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3974 PyErr_SetString(PyExc_ValueError, "character out of range");
3975 return -1;
3976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3978 index, ch);
3979 return 0;
3980}
3981
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *encoding_name = "utf-8";

    return encoding_name;
}
3987
Victor Stinner554f3f02010-06-16 23:33:54 +00003988/* create or adjust a UnicodeDecodeError */
3989static void
3990make_decode_exception(PyObject **exceptionObject,
3991 const char *encoding,
3992 const char *input, Py_ssize_t length,
3993 Py_ssize_t startpos, Py_ssize_t endpos,
3994 const char *reason)
3995{
3996 if (*exceptionObject == NULL) {
3997 *exceptionObject = PyUnicodeDecodeError_Create(
3998 encoding, input, length, startpos, endpos, reason);
3999 }
4000 else {
4001 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4002 goto onError;
4003 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4004 goto onError;
4005 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4006 goto onError;
4007 }
4008 return;
4009
4010onError:
4011 Py_DECREF(*exceptionObject);
4012 *exceptionObject = NULL;
4013}
4014
#ifdef HAVE_MBCS
/* error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors / errorHandler: name of the error handler and a cache for the
       looked-up callable (filled in on first use).
   encoding, reason: used to create or update *exceptionObject.
   input, inend: current input buffer bounds; re-pointed at the bytes
       object held by the exception after the callback ran, because the
       callback may have replaced it.
   startinpos, endinpos: error range passed to the exception; *endinpos
       is set to the position returned by the handler.
   inptr: set to the position in the input where decoding resumes.
   output, outpos: wstr-kind output object (resized if needed) and the
       write position, advanced past the replacement.

   return 0 on success, -1 on error (with an exception set)
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    /* repunicode is borrowed from restuple */
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* do not go on and read a non-bytes object as bytes */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically, unless doubling itself would overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4121
4122static int
4123unicode_decode_call_errorhandler_writer(
4124 const char *errors, PyObject **errorHandler,
4125 const char *encoding, const char *reason,
4126 const char **input, const char **inend, Py_ssize_t *startinpos,
4127 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4128 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4129{
4130 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4131
4132 PyObject *restuple = NULL;
4133 PyObject *repunicode = NULL;
4134 Py_ssize_t insize;
4135 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004136 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004137 PyObject *inputobj = NULL;
4138
4139 if (*errorHandler == NULL) {
4140 *errorHandler = PyCodec_LookupError(errors);
4141 if (*errorHandler == NULL)
4142 goto onError;
4143 }
4144
4145 make_decode_exception(exceptionObject,
4146 encoding,
4147 *input, *inend - *input,
4148 *startinpos, *endinpos,
4149 reason);
4150 if (*exceptionObject == NULL)
4151 goto onError;
4152
4153 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4154 if (restuple == NULL)
4155 goto onError;
4156 if (!PyTuple_Check(restuple)) {
4157 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4158 goto onError;
4159 }
4160 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004161 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004162
4163 /* Copy back the bytes variables, which might have been modified by the
4164 callback */
4165 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4166 if (!inputobj)
4167 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004168 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004170 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004171 *input = PyBytes_AS_STRING(inputobj);
4172 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004173 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004174 /* we can DECREF safely, as the exception has another reference,
4175 so the object won't go away. */
4176 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004177
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004180 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4182 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004183 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184
Victor Stinner8f674cc2013-04-17 23:02:17 +02004185 if (PyUnicode_READY(repunicode) < 0)
4186 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004187 replen = PyUnicode_GET_LENGTH(repunicode);
4188 writer->min_length += replen;
4189 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004190 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004191 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004192 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004195 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004196
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198 Py_XDECREF(restuple);
4199 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004203 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204}
4205
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The directO and directWS arguments are parenthesized in the expansion
 * so that a compound flag expression (e.g. `a || b`) is evaluated as a
 * whole before being combined with the category test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286
Alexander Belopolsky40018472011-02-26 01:02:56 +00004287PyObject *
4288PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004289 Py_ssize_t size,
4290 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004291{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004292 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4293}
4294
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295/* The decoder. The only state we preserve is our read position,
4296 * i.e. how many characters we have consumed. So if we end in the
4297 * middle of a shift sequence we have to back off the read position
4298 * and the output to the beginning of the sequence, otherwise we lose
4299 * all the shift state (seen bits, number of bits seen, high
4300 * surrogate). */
4301
Alexander Belopolsky40018472011-02-26 01:02:56 +00004302PyObject *
4303PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004304 Py_ssize_t size,
4305 const char *errors,
4306 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004307{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004309 Py_ssize_t startinpos;
4310 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004311 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004312 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 const char *errmsg = "";
4314 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004315 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004316 unsigned int base64bits = 0;
4317 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004318 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 PyObject *errorHandler = NULL;
4320 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004322 if (size == 0) {
4323 if (consumed)
4324 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004325 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004326 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004329 _PyUnicodeWriter_Init(&writer);
4330 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331
4332 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333 e = s + size;
4334
4335 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004336 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004338 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004339
Antoine Pitrou244651a2009-05-04 18:56:13 +00004340 if (inShift) { /* in a base-64 section */
4341 if (IS_BASE64(ch)) { /* consume a base-64 character */
4342 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4343 base64bits += 6;
4344 s++;
4345 if (base64bits >= 16) {
4346 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004347 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 base64bits -= 16;
4349 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004350 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 if (surrogate) {
4352 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004353 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4354 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004355 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004356 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004358 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
4360 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004361 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004362 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 }
4365 }
Victor Stinner551ac952011-11-29 22:58:13 +01004366 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 /* first surrogate */
4368 surrogate = outCh;
4369 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004371 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004372 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004373 }
4374 }
4375 }
4376 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 inShift = 0;
4378 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004380 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004381 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004382 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 if (base64bits > 0) { /* left-over bits */
4385 if (base64bits >= 6) {
4386 /* We've seen at least one base-64 character */
4387 errmsg = "partial character in shift sequence";
4388 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 else {
4391 /* Some bits remain; they should be zero */
4392 if (base64buffer != 0) {
4393 errmsg = "non-zero padding bits in shift sequence";
4394 goto utf7Error;
4395 }
4396 }
4397 }
4398 if (ch != '-') {
4399 /* '-' is absorbed; other terminating
4400 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004401 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004402 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 }
4405 }
4406 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 s++; /* consume '+' */
4409 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004411 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004412 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 }
4414 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004418 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 }
4420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004423 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 else {
4427 startinpos = s-starts;
4428 s++;
4429 errmsg = "unexpected special character";
4430 goto utf7Error;
4431 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004435 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 errors, &errorHandler,
4437 "utf7", errmsg,
4438 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004439 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004441 }
4442
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443 /* end of string */
4444
4445 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4446 /* if we're in an inconsistent state, that's an error */
4447 if (surrogate ||
4448 (base64bits >= 6) ||
4449 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004451 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 errors, &errorHandler,
4453 "utf7", "unterminated shift sequence",
4454 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004455 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 goto onError;
4457 if (s < e)
4458 goto restart;
4459 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461
4462 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004463 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004465 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004466 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 }
4468 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004469 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004471 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 Py_XDECREF(errorHandler);
4474 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004475 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 Py_XDECREF(errorHandler);
4479 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004480 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481 return NULL;
4482}
4483
4484
Alexander Belopolsky40018472011-02-26 01:02:56 +00004485PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004486_PyUnicode_EncodeUTF7(PyObject *str,
4487 int base64SetO,
4488 int base64WhiteSpace,
4489 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004491 int kind;
4492 void *data;
4493 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004494 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004495 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004496 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 unsigned int base64bits = 0;
4498 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499 char * out;
4500 char * start;
4501
Benjamin Petersonbac79492012-01-14 13:34:47 -05004502 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004503 return NULL;
4504 kind = PyUnicode_KIND(str);
4505 data = PyUnicode_DATA(str);
4506 len = PyUnicode_GET_LENGTH(str);
4507
4508 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004511 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004512 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004513 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004514 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515 if (v == NULL)
4516 return NULL;
4517
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004518 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004519 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004520 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521
Antoine Pitrou244651a2009-05-04 18:56:13 +00004522 if (inShift) {
4523 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4524 /* shifting out */
4525 if (base64bits) { /* output remaining bits */
4526 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4527 base64buffer = 0;
4528 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004529 }
4530 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 /* Characters not in the BASE64 set implicitly unshift the sequence
4532 so no '-' is required, except if the character is itself a '-' */
4533 if (IS_BASE64(ch) || ch == '-') {
4534 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 *out++ = (char) ch;
4537 }
4538 else {
4539 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004540 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 else { /* not in a shift sequence */
4543 if (ch == '+') {
4544 *out++ = '+';
4545 *out++ = '-';
4546 }
4547 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4548 *out++ = (char) ch;
4549 }
4550 else {
4551 *out++ = '+';
4552 inShift = 1;
4553 goto encode_char;
4554 }
4555 }
4556 continue;
4557encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004559 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004560
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 /* code first surrogate */
4562 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004563 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 while (base64bits >= 6) {
4565 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4566 base64bits -= 6;
4567 }
4568 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004569 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 base64bits += 16;
4572 base64buffer = (base64buffer << 16) | ch;
4573 while (base64bits >= 6) {
4574 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4575 base64bits -= 6;
4576 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 if (base64bits)
4579 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4580 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004582 if (_PyBytes_Resize(&v, out - start) < 0)
4583 return NULL;
4584 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004586PyObject *
4587PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4588 Py_ssize_t size,
4589 int base64SetO,
4590 int base64WhiteSpace,
4591 const char *errors)
4592{
4593 PyObject *result;
4594 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4595 if (tmp == NULL)
4596 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004597 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004598 base64WhiteSpace, errors);
4599 Py_DECREF(tmp);
4600 return result;
4601}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603#undef IS_BASE64
4604#undef FROM_BASE64
4605#undef TO_BASE64
4606#undef DECODE_DIRECT
4607#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609/* --- UTF-8 Codec -------------------------------------------------------- */
4610
Alexander Belopolsky40018472011-02-26 01:02:56 +00004611PyObject *
4612PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004613 Py_ssize_t size,
4614 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615{
Walter Dörwald69652032004-09-07 20:24:22 +00004616 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4617}
4618
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004619#include "stringlib/asciilib.h"
4620#include "stringlib/codecs.h"
4621#include "stringlib/undef.h"
4622
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004623#include "stringlib/ucs1lib.h"
4624#include "stringlib/codecs.h"
4625#include "stringlib/undef.h"
4626
4627#include "stringlib/ucs2lib.h"
4628#include "stringlib/codecs.h"
4629#include "stringlib/undef.h"
4630
4631#include "stringlib/ucs4lib.h"
4632#include "stringlib/codecs.h"
4633#include "stringlib/undef.h"
4634
Antoine Pitrouab868312009-01-10 15:40:25 +00004635/* Mask to quickly check whether a C 'long' contains a
4636 non-ASCII, UTF8-encoded char. */
4637#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004638# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004639#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004640# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004641#else
4642# error C 'long' size should be either 4 or 8!
4643#endif
4644
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004645static Py_ssize_t
4646ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004647{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004649 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004651 /*
4652 * Issue #17237: m68k is a bit different from most architectures in
4653 * that objects do not use "natural alignment" - for example, int and
4654 * long are only aligned at 2-byte boundaries. Therefore the assert()
4655 * won't work; also, tests have shown that skipping the "optimised
4656 * version" will even speed up m68k.
4657 */
4658#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004659#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004660 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4661 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662 /* Fast path, see in STRINGLIB(utf8_decode) for
4663 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004664 /* Help allocation */
4665 const char *_p = p;
4666 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 while (_p < aligned_end) {
4668 unsigned long value = *(const unsigned long *) _p;
4669 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004670 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671 *((unsigned long *)q) = value;
4672 _p += SIZEOF_LONG;
4673 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004674 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004675 p = _p;
4676 while (p < end) {
4677 if ((unsigned char)*p & 0x80)
4678 break;
4679 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004684#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685 while (p < end) {
4686 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4687 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004688 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004689 /* Help allocation */
4690 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004691 while (_p < aligned_end) {
4692 unsigned long value = *(unsigned long *) _p;
4693 if (value & ASCII_CHAR_MASK)
4694 break;
4695 _p += SIZEOF_LONG;
4696 }
4697 p = _p;
4698 if (_p == end)
4699 break;
4700 }
4701 if ((unsigned char)*p & 0x80)
4702 break;
4703 ++p;
4704 }
4705 memcpy(dest, start, p - start);
4706 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707}
Antoine Pitrouab868312009-01-10 15:40:25 +00004708
Victor Stinner785938e2011-12-11 20:09:03 +01004709PyObject *
4710PyUnicode_DecodeUTF8Stateful(const char *s,
4711 Py_ssize_t size,
4712 const char *errors,
4713 Py_ssize_t *consumed)
4714{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004715 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004716 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004717 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718
4719 Py_ssize_t startinpos;
4720 Py_ssize_t endinpos;
4721 const char *errmsg = "";
4722 PyObject *errorHandler = NULL;
4723 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004724
4725 if (size == 0) {
4726 if (consumed)
4727 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004728 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004729 }
4730
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004731 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4732 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004733 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 *consumed = 1;
4735 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004736 }
4737
Victor Stinner8f674cc2013-04-17 23:02:17 +02004738 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004739 writer.min_length = size;
4740 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004741 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004742
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004743 writer.pos = ascii_decode(s, end, writer.data);
4744 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 while (s < end) {
4746 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004747 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004749 if (PyUnicode_IS_ASCII(writer.buffer))
4750 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004752 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004754 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004755 } else {
4756 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004757 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 }
4759
4760 switch (ch) {
4761 case 0:
4762 if (s == end || consumed)
4763 goto End;
4764 errmsg = "unexpected end of data";
4765 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004766 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004767 break;
4768 case 1:
4769 errmsg = "invalid start byte";
4770 startinpos = s - starts;
4771 endinpos = startinpos + 1;
4772 break;
4773 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004774 case 3:
4775 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004776 errmsg = "invalid continuation byte";
4777 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004778 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 break;
4780 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004781 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 goto onError;
4783 continue;
4784 }
4785
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004786 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787 errors, &errorHandler,
4788 "utf-8", errmsg,
4789 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004790 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004791 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004792 }
4793
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004795 if (consumed)
4796 *consumed = s - starts;
4797
4798 Py_XDECREF(errorHandler);
4799 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004800 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004801
4802onError:
4803 Py_XDECREF(errorHandler);
4804 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004805 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004806 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004807}
4808
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s`.  Each byte that cannot be decoded is
   replaced by the code unit 0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL on memory
   allocation error or if `size` is negative or too large to allocate. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count.
       The bound is written without computing size + 1 so that it cannot
       overflow; a negative size is rejected outright because it would
       otherwise lead to an undersized allocation. */
    if (size < 0
        || (size_t)size > (size_t)PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* ch is about to be split into a surrogate pair, so it must be
               a non-BMP code point (not itself a surrogate). */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004865/* Primary internal function which creates utf8 encoded bytes objects.
4866
4867 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004868 and allocate exactly as much space needed at the end. Else allocate the
4869 maximum possible needed (4 result bytes per Unicode character), and return
4870 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004871*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004872PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004873_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874{
Victor Stinner6099a032011-12-18 14:22:26 +01004875 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876 void *data;
4877 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879 if (!PyUnicode_Check(unicode)) {
4880 PyErr_BadArgument();
4881 return NULL;
4882 }
4883
4884 if (PyUnicode_READY(unicode) == -1)
4885 return NULL;
4886
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004887 if (PyUnicode_UTF8(unicode))
4888 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4889 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890
4891 kind = PyUnicode_KIND(unicode);
4892 data = PyUnicode_DATA(unicode);
4893 size = PyUnicode_GET_LENGTH(unicode);
4894
Benjamin Petersonead6b532011-12-20 17:23:42 -06004895 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004896 default:
4897 assert(0);
4898 case PyUnicode_1BYTE_KIND:
4899 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4900 assert(!PyUnicode_IS_ASCII(unicode));
4901 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4902 case PyUnicode_2BYTE_KIND:
4903 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4904 case PyUnicode_4BYTE_KIND:
4905 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907}
4908
Alexander Belopolsky40018472011-02-26 01:02:56 +00004909PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004910PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4911 Py_ssize_t size,
4912 const char *errors)
4913{
4914 PyObject *v, *unicode;
4915
4916 unicode = PyUnicode_FromUnicode(s, size);
4917 if (unicode == NULL)
4918 return NULL;
4919 v = _PyUnicode_AsUTF8String(unicode, errors);
4920 Py_DECREF(unicode);
4921 return v;
4922}
4923
4924PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004925PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004927 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928}
4929
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930/* --- UTF-32 Codec ------------------------------------------------------- */
4931
4932PyObject *
4933PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 Py_ssize_t size,
4935 const char *errors,
4936 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937{
4938 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4939}
4940
4941PyObject *
4942PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 Py_ssize_t size,
4944 const char *errors,
4945 int *byteorder,
4946 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947{
4948 const char *starts = s;
4949 Py_ssize_t startinpos;
4950 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004951 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004952 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004953 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955 PyObject *errorHandler = NULL;
4956 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004957
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 q = (unsigned char *)s;
4959 e = q + size;
4960
4961 if (byteorder)
4962 bo = *byteorder;
4963
4964 /* Check for BOM marks (U+FEFF) in the input and adjust current
4965 byte order setting accordingly. In native mode, the leading BOM
4966 mark is skipped, in all other modes, it is copied to the output
4967 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004968 if (bo == 0 && size >= 4) {
4969 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4970 if (bom == 0x0000FEFF) {
4971 bo = -1;
4972 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004974 else if (bom == 0xFFFE0000) {
4975 bo = 1;
4976 q += 4;
4977 }
4978 if (byteorder)
4979 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980 }
4981
Victor Stinnere64322e2012-10-30 23:12:47 +01004982 if (q == e) {
4983 if (consumed)
4984 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004985 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986 }
4987
Victor Stinnere64322e2012-10-30 23:12:47 +01004988#ifdef WORDS_BIGENDIAN
4989 le = bo < 0;
4990#else
4991 le = bo <= 0;
4992#endif
4993
Victor Stinner8f674cc2013-04-17 23:02:17 +02004994 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004995 writer.min_length = (e - q + 3) / 4;
4996 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004998
Victor Stinnere64322e2012-10-30 23:12:47 +01004999 while (1) {
5000 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005001 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005002
Victor Stinnere64322e2012-10-30 23:12:47 +01005003 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005004 enum PyUnicode_Kind kind = writer.kind;
5005 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005006 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005007 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 if (le) {
5009 do {
5010 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5011 if (ch > maxch)
5012 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005013 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005014 q += 4;
5015 } while (q <= last);
5016 }
5017 else {
5018 do {
5019 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5020 if (ch > maxch)
5021 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005022 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005023 q += 4;
5024 } while (q <= last);
5025 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005026 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005027 }
5028
5029 if (ch <= maxch) {
5030 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005032 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 startinpos = ((const char *)q) - starts;
5035 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005037 else {
5038 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005039 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005040 goto onError;
5041 q += 4;
5042 continue;
5043 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005045 startinpos = ((const char *)q) - starts;
5046 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005048
5049 /* The remaining input chars are ignored if the callback
5050 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005051 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 errors, &errorHandler,
5053 "utf32", errmsg,
5054 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005055 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 }
5058
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062 Py_XDECREF(errorHandler);
5063 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005064 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005067 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068 Py_XDECREF(errorHandler);
5069 Py_XDECREF(exc);
5070 return NULL;
5071}
5072
5073PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005074_PyUnicode_EncodeUTF32(PyObject *str,
5075 const char *errors,
5076 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005078 int kind;
5079 void *data;
5080 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005081 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005083 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005085#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086 int iorder[] = {0, 1, 2, 3};
5087#else
5088 int iorder[] = {3, 2, 1, 0};
5089#endif
5090
Benjamin Peterson29060642009-01-31 22:14:21 +00005091#define STORECHAR(CH) \
5092 do { \
5093 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5094 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5095 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5096 p[iorder[0]] = (CH) & 0xff; \
5097 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 } while(0)
5099
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005100 if (!PyUnicode_Check(str)) {
5101 PyErr_BadArgument();
5102 return NULL;
5103 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005104 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005105 return NULL;
5106 kind = PyUnicode_KIND(str);
5107 data = PyUnicode_DATA(str);
5108 len = PyUnicode_GET_LENGTH(str);
5109
5110 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005111 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005113 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114 if (v == NULL)
5115 return NULL;
5116
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005117 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005121 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005122
5123 if (byteorder == -1) {
5124 /* force LE */
5125 iorder[0] = 0;
5126 iorder[1] = 1;
5127 iorder[2] = 2;
5128 iorder[3] = 3;
5129 }
5130 else if (byteorder == 1) {
5131 /* force BE */
5132 iorder[0] = 3;
5133 iorder[1] = 2;
5134 iorder[2] = 1;
5135 iorder[3] = 0;
5136 }
5137
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005138 for (i = 0; i < len; i++)
5139 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005140
5141 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005142 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143#undef STORECHAR
5144}
5145
Alexander Belopolsky40018472011-02-26 01:02:56 +00005146PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005147PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5148 Py_ssize_t size,
5149 const char *errors,
5150 int byteorder)
5151{
5152 PyObject *result;
5153 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5154 if (tmp == NULL)
5155 return NULL;
5156 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5157 Py_DECREF(tmp);
5158 return result;
5159}
5160
5161PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005162PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163{
Victor Stinnerb960b342011-11-20 19:12:52 +01005164 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005165}
5166
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167/* --- UTF-16 Codec ------------------------------------------------------- */
5168
Tim Peters772747b2001-08-09 22:21:55 +00005169PyObject *
5170PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 Py_ssize_t size,
5172 const char *errors,
5173 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174{
Walter Dörwald69652032004-09-07 20:24:22 +00005175 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5176}
5177
5178PyObject *
5179PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 Py_ssize_t size,
5181 const char *errors,
5182 int *byteorder,
5183 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005186 Py_ssize_t startinpos;
5187 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005188 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005189 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005190 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005191 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005192 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193 PyObject *errorHandler = NULL;
5194 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195
Tim Peters772747b2001-08-09 22:21:55 +00005196 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005197 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198
5199 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005200 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005202 /* Check for BOM marks (U+FEFF) in the input and adjust current
5203 byte order setting accordingly. In native mode, the leading BOM
5204 mark is skipped, in all other modes, it is copied to the output
5205 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005206 if (bo == 0 && size >= 2) {
5207 const Py_UCS4 bom = (q[1] << 8) | q[0];
5208 if (bom == 0xFEFF) {
5209 q += 2;
5210 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005212 else if (bom == 0xFFFE) {
5213 q += 2;
5214 bo = 1;
5215 }
5216 if (byteorder)
5217 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
Antoine Pitrou63065d72012-05-15 23:48:04 +02005220 if (q == e) {
5221 if (consumed)
5222 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005223 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005224 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005225
Christian Heimes743e0cd2012-10-17 23:52:17 +02005226#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005227 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005228#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005229 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005230#endif
Tim Peters772747b2001-08-09 22:21:55 +00005231
Antoine Pitrou63065d72012-05-15 23:48:04 +02005232 /* Note: size will always be longer than the resulting Unicode
5233 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005234 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005235 writer.min_length = (e - q + 1) / 2;
5236 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005237 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005238
Antoine Pitrou63065d72012-05-15 23:48:04 +02005239 while (1) {
5240 Py_UCS4 ch = 0;
5241 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005242 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005243 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005245 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005246 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005247 native_ordering);
5248 else
5249 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005250 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005251 native_ordering);
5252 } else if (kind == PyUnicode_2BYTE_KIND) {
5253 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005254 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005255 native_ordering);
5256 } else {
5257 assert(kind == PyUnicode_4BYTE_KIND);
5258 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005259 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005260 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005261 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005262 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005263
Antoine Pitrou63065d72012-05-15 23:48:04 +02005264 switch (ch)
5265 {
5266 case 0:
5267 /* remaining byte at the end? (size should be even) */
5268 if (q == e || consumed)
5269 goto End;
5270 errmsg = "truncated data";
5271 startinpos = ((const char *)q) - starts;
5272 endinpos = ((const char *)e) - starts;
5273 break;
5274 /* The remaining input chars are ignored if the callback
5275 chooses to skip the input */
5276 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005277 q -= 2;
5278 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005279 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005280 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005281 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005282 endinpos = ((const char *)e) - starts;
5283 break;
5284 case 2:
5285 errmsg = "illegal encoding";
5286 startinpos = ((const char *)q) - 2 - starts;
5287 endinpos = startinpos + 2;
5288 break;
5289 case 3:
5290 errmsg = "illegal UTF-16 surrogate";
5291 startinpos = ((const char *)q) - 4 - starts;
5292 endinpos = startinpos + 2;
5293 break;
5294 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005295 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005296 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 continue;
5298 }
5299
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005301 errors,
5302 &errorHandler,
5303 "utf16", errmsg,
5304 &starts,
5305 (const char **)&e,
5306 &startinpos,
5307 &endinpos,
5308 &exc,
5309 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005310 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 }
5313
Antoine Pitrou63065d72012-05-15 23:48:04 +02005314End:
Walter Dörwald69652032004-09-07 20:24:22 +00005315 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005317
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005318 Py_XDECREF(errorHandler);
5319 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005320 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005323 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 Py_XDECREF(errorHandler);
5325 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 return NULL;
5327}
5328
Tim Peters772747b2001-08-09 22:21:55 +00005329PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005330_PyUnicode_EncodeUTF16(PyObject *str,
5331 const char *errors,
5332 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005334 enum PyUnicode_Kind kind;
5335 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005336 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005337 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005338 unsigned short *out;
5339 Py_ssize_t bytesize;
5340 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005341#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005342 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005343#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005344 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005345#endif
5346
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005347 if (!PyUnicode_Check(str)) {
5348 PyErr_BadArgument();
5349 return NULL;
5350 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005351 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005352 return NULL;
5353 kind = PyUnicode_KIND(str);
5354 data = PyUnicode_DATA(str);
5355 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005356
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005357 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005358 if (kind == PyUnicode_4BYTE_KIND) {
5359 const Py_UCS4 *in = (const Py_UCS4 *)data;
5360 const Py_UCS4 *end = in + len;
5361 while (in < end)
5362 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005363 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005364 }
5365 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005367 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005368 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 if (v == NULL)
5370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005372 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005373 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005374 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005376 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005377 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005378 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005379
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005380 switch (kind) {
5381 case PyUnicode_1BYTE_KIND: {
5382 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5383 break;
Tim Peters772747b2001-08-09 22:21:55 +00005384 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005385 case PyUnicode_2BYTE_KIND: {
5386 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5387 break;
Tim Peters772747b2001-08-09 22:21:55 +00005388 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005389 case PyUnicode_4BYTE_KIND: {
5390 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5391 break;
5392 }
5393 default:
5394 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005395 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005396
5397 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005398 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399}
5400
Alexander Belopolsky40018472011-02-26 01:02:56 +00005401PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005402PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5403 Py_ssize_t size,
5404 const char *errors,
5405 int byteorder)
5406{
5407 PyObject *result;
5408 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5409 if (tmp == NULL)
5410 return NULL;
5411 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5412 Py_DECREF(tmp);
5413 return result;
5414}
5415
5416PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005417PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005419 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420}
5421
5422/* --- Unicode Escape Codec ----------------------------------------------- */
5423
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string decodes to pure ASCII and, if so, how long the
   decoded string is.

   Returns the number of characters needed to hold the decoded string,
   or -1 if
     - size is negative,
     - the input contains a byte above 127,
     - a backslash is the last byte of the input, or
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before it is used in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5478
/* Pointer to the unicodedata name-lookup C API.  It starts out NULL and
   is filled in with PyCapsule_Import() the first time
   PyUnicode_DecodeUnicodeEscape() encounters a \N escape. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005480
Alexander Belopolsky40018472011-02-26 01:02:56 +00005481PyObject *
5482PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005483 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005484 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005487 Py_ssize_t startinpos;
5488 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005489 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005491 char* message;
5492 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 PyObject *errorHandler = NULL;
5494 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005495 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005496
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005497 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005498 if (len == 0)
5499 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005500
5501 /* After length_of_escaped_ascii_string() there are two alternatives,
5502 either the string is pure ASCII with named escapes like \n, etc.
5503 and we determined it's exact size (common case)
5504 or it contains \x, \u, ... escape sequences. then we create a
5505 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005506 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005507 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005508 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509 }
5510 else {
5511 /* Escaped strings will always be longer than the resulting
5512 Unicode string, so we start with size here and then reduce the
5513 length after conversion to the true value.
5514 (but if the error callback returns a long replacement string
5515 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005516 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517 }
5518
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005522
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 while (s < end) {
5524 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005525 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
5528 /* Non-escape characters are interpreted as Unicode ordinals */
5529 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005530 x = (unsigned char)*s;
5531 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005532 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005533 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 continue;
5535 }
5536
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 /* \ - Escapes */
5539 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005540 c = *s++;
5541 if (s > end)
5542 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005544 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005547#define WRITECHAR(ch) \
5548 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005549 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005550 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005551 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005552
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005554 case '\\': WRITECHAR('\\'); break;
5555 case '\'': WRITECHAR('\''); break;
5556 case '\"': WRITECHAR('\"'); break;
5557 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559 case 'f': WRITECHAR('\014'); break;
5560 case 't': WRITECHAR('\t'); break;
5561 case 'n': WRITECHAR('\n'); break;
5562 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005563 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005564 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005565 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005566 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 case '0': case '1': case '2': case '3':
5570 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005571 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005572 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005573 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005574 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005575 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005577 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 break;
5579
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 /* hex escapes */
5581 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005583 digits = 2;
5584 message = "truncated \\xXX escape";
5585 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005589 digits = 4;
5590 message = "truncated \\uXXXX escape";
5591 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005594 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005595 digits = 8;
5596 message = "truncated \\UXXXXXXXX escape";
5597 hexescape:
5598 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005599 if (end - s < digits) {
5600 /* count only hex digits */
5601 for (; s < end; ++s) {
5602 c = (unsigned char)*s;
5603 if (!Py_ISXDIGIT(c))
5604 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005605 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005606 goto error;
5607 }
5608 for (; digits--; ++s) {
5609 c = (unsigned char)*s;
5610 if (!Py_ISXDIGIT(c))
5611 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005612 chr = (chr<<4) & ~0xF;
5613 if (c >= '0' && c <= '9')
5614 chr += c - '0';
5615 else if (c >= 'a' && c <= 'f')
5616 chr += 10 + c - 'a';
5617 else
5618 chr += 10 + c - 'A';
5619 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005620 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005621 /* _decoding_error will have already written into the
5622 target buffer. */
5623 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005624 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005625 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005626 message = "illegal Unicode character";
5627 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005628 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005629 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005630 break;
5631
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005633 case 'N':
5634 message = "malformed \\N character escape";
5635 if (ucnhash_CAPI == NULL) {
5636 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005637 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5638 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005639 if (ucnhash_CAPI == NULL)
5640 goto ucnhashError;
5641 }
5642 if (*s == '{') {
5643 const char *start = s+1;
5644 /* look for the closing brace */
5645 while (*s != '}' && s < end)
5646 s++;
5647 if (s > start && s < end && *s == '}') {
5648 /* found a name. look it up in the unicode database */
5649 message = "unknown Unicode character name";
5650 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005651 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005652 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005653 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005654 goto store;
5655 }
5656 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005657 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005658
5659 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005660 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 message = "\\ at end of string";
5662 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005663 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005664 }
5665 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005666 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005667 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005668 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005669 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005671 continue;
5672
5673 error:
5674 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005675 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005676 errors, &errorHandler,
5677 "unicodeescape", message,
5678 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005679 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005680 goto onError;
5681 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005683#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005685 Py_XDECREF(errorHandler);
5686 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005688
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005690 PyErr_SetString(
5691 PyExc_UnicodeError,
5692 "\\N escapes not supported (can't load unicodedata module)"
5693 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 Py_XDECREF(errorHandler);
5696 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005697 return NULL;
5698
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005700 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 Py_XDECREF(errorHandler);
5702 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 return NULL;
5704}
5705
5706/* Return a Unicode-Escape string version of the Unicode object.
5707
5708 If quotes is true, the string is enclosed in u"" or u'' quotes as
5709 appropriate.
5710
5711*/
5712
Alexander Belopolsky40018472011-02-26 01:02:56 +00005713PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005714PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005716 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005717 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719 int kind;
5720 void *data;
5721 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722
Ezio Melottie7f90372012-10-05 03:33:31 +03005723 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005724 escape.
5725
Ezio Melottie7f90372012-10-05 03:33:31 +03005726 For UCS1 strings it's '\xxx', 4 bytes per source character.
5727 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5728 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005729 */
5730
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005731 if (!PyUnicode_Check(unicode)) {
5732 PyErr_BadArgument();
5733 return NULL;
5734 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005735 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 return NULL;
5737 len = PyUnicode_GET_LENGTH(unicode);
5738 kind = PyUnicode_KIND(unicode);
5739 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005740 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005741 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5742 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5743 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5744 }
5745
5746 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005747 return PyBytes_FromStringAndSize(NULL, 0);
5748
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005749 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005751
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005752 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 if (repr == NULL)
5757 return NULL;
5758
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005759 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005761 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005762 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005763
Walter Dörwald79e913e2007-05-12 11:08:06 +00005764 /* Escape backslashes */
5765 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 *p++ = '\\';
5767 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005768 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005769 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005770
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005771 /* Map 21-bit characters to '\U00xxxxxx' */
5772 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005773 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005774 *p++ = '\\';
5775 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005776 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5777 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5778 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5779 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5780 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5781 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5782 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5783 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005785 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005786
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005788 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 *p++ = '\\';
5790 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005791 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5792 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5793 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5794 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005796
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005797 /* Map special whitespace to '\t', \n', '\r' */
5798 else if (ch == '\t') {
5799 *p++ = '\\';
5800 *p++ = 't';
5801 }
5802 else if (ch == '\n') {
5803 *p++ = '\\';
5804 *p++ = 'n';
5805 }
5806 else if (ch == '\r') {
5807 *p++ = '\\';
5808 *p++ = 'r';
5809 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005810
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005811 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005812 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005814 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005815 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5816 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005817 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005818
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 /* Copy everything else as-is */
5820 else
5821 *p++ = (char) ch;
5822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005824 assert(p - PyBytes_AS_STRING(repr) > 0);
5825 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5826 return NULL;
5827 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828}
5829
Alexander Belopolsky40018472011-02-26 01:02:56 +00005830PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005831PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5832 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005834 PyObject *result;
5835 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5836 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005838 result = PyUnicode_AsUnicodeEscapeString(tmp);
5839 Py_DECREF(tmp);
5840 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841}
5842
5843/* --- Raw Unicode Escape Codec ------------------------------------------- */
5844
Alexander Belopolsky40018472011-02-26 01:02:56 +00005845PyObject *
5846PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005847 Py_ssize_t size,
5848 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005851 Py_ssize_t startinpos;
5852 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 const char *end;
5855 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 PyObject *errorHandler = NULL;
5857 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005858
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005859 if (size == 0)
5860 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 /* Escaped strings will always be longer than the resulting
5863 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 length after conversion to the true value. (But decoding error
5865 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005866 _PyUnicodeWriter_Init(&writer);
5867 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005868
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 end = s + size;
5870 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 unsigned char c;
5872 Py_UCS4 x;
5873 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005874 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 /* Non-escape characters are interpreted as Unicode ordinals */
5877 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005878 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005879 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005880 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 startinpos = s-starts;
5884
5885 /* \u-escapes are only interpreted iff the number of leading
5886 backslashes if odd */
5887 bs = s;
5888 for (;s < end;) {
5889 if (*s != '\\')
5890 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005891 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005892 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005893 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 }
5895 if (((s - bs) & 1) == 0 ||
5896 s >= end ||
5897 (*s != 'u' && *s != 'U')) {
5898 continue;
5899 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 count = *s=='u' ? 4 : 8;
5902 s++;
5903
5904 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 for (x = 0, i = 0; i < count; ++i, ++s) {
5906 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005907 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005909 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 errors, &errorHandler,
5911 "rawunicodeescape", "truncated \\uXXXX",
5912 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005913 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 goto onError;
5915 goto nextByte;
5916 }
5917 x = (x<<4) & ~0xF;
5918 if (c >= '0' && c <= '9')
5919 x += c - '0';
5920 else if (c >= 'a' && c <= 'f')
5921 x += 10 + c - 'a';
5922 else
5923 x += 10 + c - 'A';
5924 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005925 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005926 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005927 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005928 }
5929 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005930 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005931 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005932 errors, &errorHandler,
5933 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005935 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005937 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 nextByte:
5939 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941 Py_XDECREF(errorHandler);
5942 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005943 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005944
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005946 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 Py_XDECREF(errorHandler);
5948 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 return NULL;
5950}
5951
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952
Alexander Belopolsky40018472011-02-26 01:02:56 +00005953PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005956 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 char *p;
5958 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005959 Py_ssize_t expandsize, pos;
5960 int kind;
5961 void *data;
5962 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005964 if (!PyUnicode_Check(unicode)) {
5965 PyErr_BadArgument();
5966 return NULL;
5967 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005968 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005969 return NULL;
5970 kind = PyUnicode_KIND(unicode);
5971 data = PyUnicode_DATA(unicode);
5972 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005973 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5974 bytes, and 1 byte characters 4. */
5975 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005976
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005977 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005979
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005980 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 if (repr == NULL)
5982 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005983 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005984 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005986 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005987 for (pos = 0; pos < len; pos++) {
5988 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 /* Map 32-bit characters to '\Uxxxxxxxx' */
5990 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005991 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005992 *p++ = '\\';
5993 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005994 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5995 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5996 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5997 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5998 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5999 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6000 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6001 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006002 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006004 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 *p++ = '\\';
6006 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006007 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6008 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6009 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6010 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 /* Copy everything else as-is */
6013 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 *p++ = (char) ch;
6015 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006016
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006017 assert(p > q);
6018 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006019 return NULL;
6020 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021}
6022
Alexander Belopolsky40018472011-02-26 01:02:56 +00006023PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6025 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027 PyObject *result;
6028 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6029 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006030 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6032 Py_DECREF(tmp);
6033 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034}
6035
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006036/* --- Unicode Internal Codec ------------------------------------------- */
6037
Alexander Belopolsky40018472011-02-26 01:02:56 +00006038PyObject *
6039_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006040 Py_ssize_t size,
6041 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006042{
6043 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006044 Py_ssize_t startinpos;
6045 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006046 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006047 const char *end;
6048 const char *reason;
6049 PyObject *errorHandler = NULL;
6050 PyObject *exc = NULL;
6051
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006052 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006053 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006054 1))
6055 return NULL;
6056
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006057 if (size == 0)
6058 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006059
Victor Stinner8f674cc2013-04-17 23:02:17 +02006060 _PyUnicodeWriter_Init(&writer);
6061 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6062 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006064 }
6065 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006066
Victor Stinner8f674cc2013-04-17 23:02:17 +02006067 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006068 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006069 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006070 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006071 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006072 endinpos = end-starts;
6073 reason = "truncated input";
6074 goto error;
6075 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006076 /* We copy the raw representation one byte at a time because the
6077 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006078 ((char *) &uch)[0] = s[0];
6079 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006080#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006081 ((char *) &uch)[2] = s[2];
6082 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006083#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006084 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006085#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006086 /* We have to sanity check the raw data, otherwise doom looms for
6087 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006088 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006089 endinpos = s - starts + Py_UNICODE_SIZE;
6090 reason = "illegal code point (> 0x10FFFF)";
6091 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006092 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006093#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006094 s += Py_UNICODE_SIZE;
6095#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006096 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006097 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006098 Py_UNICODE uch2;
6099 ((char *) &uch2)[0] = s[0];
6100 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006101 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006102 {
Victor Stinner551ac952011-11-29 22:58:13 +01006103 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006104 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006105 }
6106 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006107#endif
6108
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006109 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006110 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006111 continue;
6112
6113 error:
6114 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006115 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006116 errors, &errorHandler,
6117 "unicode_internal", reason,
6118 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006119 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006120 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121 }
6122
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006123 Py_XDECREF(errorHandler);
6124 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006125 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006126
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006128 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006129 Py_XDECREF(errorHandler);
6130 Py_XDECREF(exc);
6131 return NULL;
6132}
6133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134/* --- Latin-1 Codec ------------------------------------------------------ */
6135
Alexander Belopolsky40018472011-02-26 01:02:56 +00006136PyObject *
6137PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006138 Py_ssize_t size,
6139 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006142 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143}
6144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006146static void
6147make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006148 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006149 PyObject *unicode,
6150 Py_ssize_t startpos, Py_ssize_t endpos,
6151 const char *reason)
6152{
6153 if (*exceptionObject == NULL) {
6154 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006155 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006156 encoding, unicode, startpos, endpos, reason);
6157 }
6158 else {
6159 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6160 goto onError;
6161 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6162 goto onError;
6163 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6164 goto onError;
6165 return;
6166 onError:
6167 Py_DECREF(*exceptionObject);
6168 *exceptionObject = NULL;
6169 }
6170}
6171
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006172/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006173static void
6174raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006175 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006176 PyObject *unicode,
6177 Py_ssize_t startpos, Py_ssize_t endpos,
6178 const char *reason)
6179{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006180 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006181 encoding, unicode, startpos, endpos, reason);
6182 if (*exceptionObject != NULL)
6183 PyCodec_StrictErrors(*exceptionObject);
6184}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185
6186/* error handling callback helper:
6187 build arguments, call the callback and check the arguments,
6188 put the result into newpos and return the replacement string, which
6189 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006190static PyObject *
6191unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006192 PyObject **errorHandler,
6193 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006194 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006195 Py_ssize_t startpos, Py_ssize_t endpos,
6196 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006198 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006199 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006200 PyObject *restuple;
6201 PyObject *resunicode;
6202
6203 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006205 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 }
6208
Benjamin Petersonbac79492012-01-14 13:34:47 -05006209 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006210 return NULL;
6211 len = PyUnicode_GET_LENGTH(unicode);
6212
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006213 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006214 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006215 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217
6218 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006220 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006223 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 Py_DECREF(restuple);
6225 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006226 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006227 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 &resunicode, newpos)) {
6229 Py_DECREF(restuple);
6230 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006231 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006232 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6233 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6234 Py_DECREF(restuple);
6235 return NULL;
6236 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006237 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006238 *newpos = len + *newpos;
6239 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6241 Py_DECREF(restuple);
6242 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006243 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006244 Py_INCREF(resunicode);
6245 Py_DECREF(restuple);
6246 return resunicode;
6247}
6248
Alexander Belopolsky40018472011-02-26 01:02:56 +00006249static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006250unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006251 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006252 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006253{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006254 /* input state */
6255 Py_ssize_t pos=0, size;
6256 int kind;
6257 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258 /* output object */
6259 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006260 /* pointer into the output */
6261 char *str;
6262 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006263 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006264 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6265 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006266 PyObject *errorHandler = NULL;
6267 PyObject *exc = NULL;
6268 /* the following variable is used for caching string comparisons
6269 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6270 int known_errorHandler = -1;
6271
Benjamin Petersonbac79492012-01-14 13:34:47 -05006272 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006273 return NULL;
6274 size = PyUnicode_GET_LENGTH(unicode);
6275 kind = PyUnicode_KIND(unicode);
6276 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006277 /* allocate enough for a simple encoding without
6278 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006279 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006280 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006281 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006282 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006283 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006284 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 ressize = size;
6286
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006287 while (pos < size) {
6288 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 /* can we encode this? */
6291 if (c<limit) {
6292 /* no overflow check, because we know that the space is enough */
6293 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006294 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006295 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 Py_ssize_t requiredsize;
6298 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006299 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006301 Py_ssize_t collstart = pos;
6302 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006304 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 ++collend;
6306 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6307 if (known_errorHandler==-1) {
6308 if ((errors==NULL) || (!strcmp(errors, "strict")))
6309 known_errorHandler = 1;
6310 else if (!strcmp(errors, "replace"))
6311 known_errorHandler = 2;
6312 else if (!strcmp(errors, "ignore"))
6313 known_errorHandler = 3;
6314 else if (!strcmp(errors, "xmlcharrefreplace"))
6315 known_errorHandler = 4;
6316 else
6317 known_errorHandler = 0;
6318 }
6319 switch (known_errorHandler) {
6320 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006321 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 goto onError;
6323 case 2: /* replace */
6324 while (collstart++<collend)
6325 *str++ = '?'; /* fall through */
6326 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006327 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 break;
6329 case 4: /* xmlcharrefreplace */
6330 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006331 /* determine replacement size */
6332 for (i = collstart, repsize = 0; i < collend; ++i) {
6333 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6334 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006336 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006338 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006340 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006342 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006344 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006346 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006347 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 if (requiredsize > ressize) {
6353 if (requiredsize<2*ressize)
6354 requiredsize = 2*ressize;
6355 if (_PyBytes_Resize(&res, requiredsize))
6356 goto onError;
6357 str = PyBytes_AS_STRING(res) + respos;
6358 ressize = requiredsize;
6359 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006360 /* generate replacement */
6361 for (i = collstart; i < collend; ++i) {
6362 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 break;
6366 default:
6367 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 encoding, reason, unicode, &exc,
6369 collstart, collend, &newpos);
6370 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006371 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006373 if (PyBytes_Check(repunicode)) {
6374 /* Directly copy bytes result to output. */
6375 repsize = PyBytes_Size(repunicode);
6376 if (repsize > 1) {
6377 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006378 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006379 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6380 Py_DECREF(repunicode);
6381 goto onError;
6382 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006383 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006384 ressize += repsize-1;
6385 }
6386 memcpy(str, PyBytes_AsString(repunicode), repsize);
6387 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006389 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006390 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006391 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 /* need more space? (at least enough for what we
6393 have+the replacement+the rest of the string, so
6394 we won't have to check space for encodable characters) */
6395 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006396 repsize = PyUnicode_GET_LENGTH(repunicode);
6397 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 if (requiredsize > ressize) {
6399 if (requiredsize<2*ressize)
6400 requiredsize = 2*ressize;
6401 if (_PyBytes_Resize(&res, requiredsize)) {
6402 Py_DECREF(repunicode);
6403 goto onError;
6404 }
6405 str = PyBytes_AS_STRING(res) + respos;
6406 ressize = requiredsize;
6407 }
6408 /* check if there is anything unencodable in the replacement
6409 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006410 for (i = 0; repsize-->0; ++i, ++str) {
6411 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006413 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 Py_DECREF(repunicode);
6416 goto onError;
6417 }
6418 *str = (char)c;
6419 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006421 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006423 }
6424 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006425 /* Resize if we allocated to much */
6426 size = str - PyBytes_AS_STRING(res);
6427 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006428 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006429 if (_PyBytes_Resize(&res, size) < 0)
6430 goto onError;
6431 }
6432
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 Py_XDECREF(errorHandler);
6434 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006435 return res;
6436
6437 onError:
6438 Py_XDECREF(res);
6439 Py_XDECREF(errorHandler);
6440 Py_XDECREF(exc);
6441 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442}
6443
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006445PyObject *
6446PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006447 Py_ssize_t size,
6448 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006450 PyObject *result;
6451 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6452 if (unicode == NULL)
6453 return NULL;
6454 result = unicode_encode_ucs1(unicode, errors, 256);
6455 Py_DECREF(unicode);
6456 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457}
6458
Alexander Belopolsky40018472011-02-26 01:02:56 +00006459PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006460_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461{
6462 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 PyErr_BadArgument();
6464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006466 if (PyUnicode_READY(unicode) == -1)
6467 return NULL;
6468 /* Fast path: if it is a one-byte string, construct
6469 bytes object directly. */
6470 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6471 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6472 PyUnicode_GET_LENGTH(unicode));
6473 /* Non-Latin-1 characters present. Defer to above function to
6474 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006475 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006476}
6477
6478PyObject*
6479PyUnicode_AsLatin1String(PyObject *unicode)
6480{
6481 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482}
6483
6484/* --- 7-bit ASCII Codec -------------------------------------------------- */
6485
Alexander Belopolsky40018472011-02-26 01:02:56 +00006486PyObject *
6487PyUnicode_DecodeASCII(const char *s,
6488 Py_ssize_t size,
6489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006492 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006493 int kind;
6494 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 Py_ssize_t startinpos;
6496 Py_ssize_t endinpos;
6497 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498 const char *e;
6499 PyObject *errorHandler = NULL;
6500 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006501
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006503 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006504
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006506 if (size == 1 && (unsigned char)s[0] < 128)
6507 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006508
Victor Stinner8f674cc2013-04-17 23:02:17 +02006509 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006510 writer.min_length = size;
6511 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006512 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006514 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006515 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006516 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006517 writer.pos = outpos;
6518 if (writer.pos == size)
6519 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006520
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006521 s += writer.pos;
6522 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006524 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006526 PyUnicode_WRITE(kind, data, writer.pos, c);
6527 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 ++s;
6529 }
6530 else {
6531 startinpos = s-starts;
6532 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006533 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 errors, &errorHandler,
6535 "ascii", "ordinal not in range(128)",
6536 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006537 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006539 kind = writer.kind;
6540 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 Py_XDECREF(errorHandler);
6544 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006545 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006546
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006548 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006549 Py_XDECREF(errorHandler);
6550 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 return NULL;
6552}
6553
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006555PyObject *
6556PyUnicode_EncodeASCII(const Py_UNICODE *p,
6557 Py_ssize_t size,
6558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 PyObject *result;
6561 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6562 if (unicode == NULL)
6563 return NULL;
6564 result = unicode_encode_ucs1(unicode, errors, 128);
6565 Py_DECREF(unicode);
6566 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567}
6568
Alexander Belopolsky40018472011-02-26 01:02:56 +00006569PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006570_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571{
6572 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 PyErr_BadArgument();
6574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006576 if (PyUnicode_READY(unicode) == -1)
6577 return NULL;
6578 /* Fast path: if it is an ASCII-only string, construct bytes object
6579 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006580 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006581 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6582 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006583 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006584}
6585
6586PyObject *
6587PyUnicode_AsASCIIString(PyObject *unicode)
6588{
6589 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590}
6591
Victor Stinner99b95382011-07-04 14:23:54 +02006592#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006593
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006594/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006595
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006596#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006597#define NEED_RETRY
6598#endif
6599
Victor Stinner3a50e702011-10-18 21:21:00 +02006600#ifndef WC_ERR_INVALID_CHARS
6601# define WC_ERR_INVALID_CHARS 0x0080
6602#endif
6603
6604static char*
6605code_page_name(UINT code_page, PyObject **obj)
6606{
6607 *obj = NULL;
6608 if (code_page == CP_ACP)
6609 return "mbcs";
6610 if (code_page == CP_UTF7)
6611 return "CP_UTF7";
6612 if (code_page == CP_UTF8)
6613 return "CP_UTF8";
6614
6615 *obj = PyBytes_FromFormat("cp%u", code_page);
6616 if (*obj == NULL)
6617 return NULL;
6618 return PyBytes_AS_STRING(*obj);
6619}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006620
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006622is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006623{
6624 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006625 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626
Victor Stinner3a50e702011-10-18 21:21:00 +02006627 if (!IsDBCSLeadByteEx(code_page, *curr))
6628 return 0;
6629
6630 prev = CharPrevExA(code_page, s, curr, 0);
6631 if (prev == curr)
6632 return 1;
6633 /* FIXME: This code is limited to "true" double-byte encodings,
6634 as it assumes an incomplete character consists of a single
6635 byte. */
6636 if (curr - prev == 2)
6637 return 1;
6638 if (!IsDBCSLeadByteEx(code_page, *prev))
6639 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006640 return 0;
6641}
6642
Victor Stinner3a50e702011-10-18 21:21:00 +02006643static DWORD
6644decode_code_page_flags(UINT code_page)
6645{
6646 if (code_page == CP_UTF7) {
6647 /* The CP_UTF7 decoder only supports flags=0 */
6648 return 0;
6649 }
6650 else
6651 return MB_ERR_INVALID_CHARS;
6652}
6653
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006654/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006655 * Decode a byte string from a Windows code page into unicode object in strict
6656 * mode.
6657 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006658 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6659 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006660 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006661static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006662decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006663 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006664 const char *in,
6665 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006666{
Victor Stinner3a50e702011-10-18 21:21:00 +02006667 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006668 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006669 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006670
6671 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006672 assert(insize > 0);
6673 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6674 if (outsize <= 0)
6675 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006676
6677 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006679 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006680 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 if (*v == NULL)
6682 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006683 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006684 }
6685 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006687 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006688 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006690 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006691 }
6692
6693 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006694 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6695 if (outsize <= 0)
6696 goto error;
6697 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006698
Victor Stinner3a50e702011-10-18 21:21:00 +02006699error:
6700 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6701 return -2;
6702 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006703 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006704}
6705
Victor Stinner3a50e702011-10-18 21:21:00 +02006706/*
6707 * Decode a byte string from a code page into unicode object with an error
6708 * handler.
6709 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006710 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006711 * UnicodeDecodeError exception and returns -1 on error.
6712 */
6713static int
6714decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006715 PyObject **v,
6716 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006717 const char *errors)
6718{
6719 const char *startin = in;
6720 const char *endin = in + size;
6721 const DWORD flags = decode_code_page_flags(code_page);
6722 /* Ideally, we should get reason from FormatMessage. This is the Windows
6723 2000 English version of the message. */
6724 const char *reason = "No mapping for the Unicode character exists "
6725 "in the target code page.";
6726 /* each step cannot decode more than 1 character, but a character can be
6727 represented as a surrogate pair */
6728 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006729 int insize;
6730 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 PyObject *errorHandler = NULL;
6732 PyObject *exc = NULL;
6733 PyObject *encoding_obj = NULL;
6734 char *encoding;
6735 DWORD err;
6736 int ret = -1;
6737
6738 assert(size > 0);
6739
6740 encoding = code_page_name(code_page, &encoding_obj);
6741 if (encoding == NULL)
6742 return -1;
6743
6744 if (errors == NULL || strcmp(errors, "strict") == 0) {
6745 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6746 UnicodeDecodeError. */
6747 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6748 if (exc != NULL) {
6749 PyCodec_StrictErrors(exc);
6750 Py_CLEAR(exc);
6751 }
6752 goto error;
6753 }
6754
6755 if (*v == NULL) {
6756 /* Create unicode object */
6757 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6758 PyErr_NoMemory();
6759 goto error;
6760 }
Victor Stinnerab595942011-12-17 04:59:06 +01006761 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006762 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006763 if (*v == NULL)
6764 goto error;
6765 startout = PyUnicode_AS_UNICODE(*v);
6766 }
6767 else {
6768 /* Extend unicode object */
6769 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6770 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6771 PyErr_NoMemory();
6772 goto error;
6773 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006774 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006775 goto error;
6776 startout = PyUnicode_AS_UNICODE(*v) + n;
6777 }
6778
6779 /* Decode the byte string character per character */
6780 out = startout;
6781 while (in < endin)
6782 {
6783 /* Decode a character */
6784 insize = 1;
6785 do
6786 {
6787 outsize = MultiByteToWideChar(code_page, flags,
6788 in, insize,
6789 buffer, Py_ARRAY_LENGTH(buffer));
6790 if (outsize > 0)
6791 break;
6792 err = GetLastError();
6793 if (err != ERROR_NO_UNICODE_TRANSLATION
6794 && err != ERROR_INSUFFICIENT_BUFFER)
6795 {
6796 PyErr_SetFromWindowsErr(0);
6797 goto error;
6798 }
6799 insize++;
6800 }
6801 /* 4=maximum length of a UTF-8 sequence */
6802 while (insize <= 4 && (in + insize) <= endin);
6803
6804 if (outsize <= 0) {
6805 Py_ssize_t startinpos, endinpos, outpos;
6806
6807 startinpos = in - startin;
6808 endinpos = startinpos + 1;
6809 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006810 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006811 errors, &errorHandler,
6812 encoding, reason,
6813 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006814 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006815 {
6816 goto error;
6817 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006818 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006819 }
6820 else {
6821 in += insize;
6822 memcpy(out, buffer, outsize * sizeof(wchar_t));
6823 out += outsize;
6824 }
6825 }
6826
6827 /* write a NUL character at the end */
6828 *out = 0;
6829
6830 /* Extend unicode object */
6831 outsize = out - startout;
6832 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006833 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006834 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006835 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006836
6837error:
6838 Py_XDECREF(encoding_obj);
6839 Py_XDECREF(errorHandler);
6840 Py_XDECREF(exc);
6841 return ret;
6842}
6843
Victor Stinner3a50e702011-10-18 21:21:00 +02006844static PyObject *
6845decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006846 const char *s, Py_ssize_t size,
6847 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848{
Victor Stinner76a31a62011-11-04 00:05:13 +01006849 PyObject *v = NULL;
6850 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006851
Victor Stinner3a50e702011-10-18 21:21:00 +02006852 if (code_page < 0) {
6853 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6854 return NULL;
6855 }
6856
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006857 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006859
Victor Stinner76a31a62011-11-04 00:05:13 +01006860 do
6861 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006863 if (size > INT_MAX) {
6864 chunk_size = INT_MAX;
6865 final = 0;
6866 done = 0;
6867 }
6868 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006870 {
6871 chunk_size = (int)size;
6872 final = (consumed == NULL);
6873 done = 1;
6874 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006875
Victor Stinner76a31a62011-11-04 00:05:13 +01006876 /* Skip trailing lead-byte unless 'final' is set */
6877 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6878 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879
Victor Stinner76a31a62011-11-04 00:05:13 +01006880 if (chunk_size == 0 && done) {
6881 if (v != NULL)
6882 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006883 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006884 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885
Victor Stinner76a31a62011-11-04 00:05:13 +01006886
6887 converted = decode_code_page_strict(code_page, &v,
6888 s, chunk_size);
6889 if (converted == -2)
6890 converted = decode_code_page_errors(code_page, &v,
6891 s, chunk_size,
6892 errors);
6893 assert(converted != 0);
6894
6895 if (converted < 0) {
6896 Py_XDECREF(v);
6897 return NULL;
6898 }
6899
6900 if (consumed)
6901 *consumed += converted;
6902
6903 s += converted;
6904 size -= converted;
6905 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006906
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006907 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006908}
6909
Alexander Belopolsky40018472011-02-26 01:02:56 +00006910PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006911PyUnicode_DecodeCodePageStateful(int code_page,
6912 const char *s,
6913 Py_ssize_t size,
6914 const char *errors,
6915 Py_ssize_t *consumed)
6916{
6917 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6918}
6919
6920PyObject *
6921PyUnicode_DecodeMBCSStateful(const char *s,
6922 Py_ssize_t size,
6923 const char *errors,
6924 Py_ssize_t *consumed)
6925{
6926 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6927}
6928
6929PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006930PyUnicode_DecodeMBCS(const char *s,
6931 Py_ssize_t size,
6932 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006933{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6935}
6936
Victor Stinner3a50e702011-10-18 21:21:00 +02006937static DWORD
6938encode_code_page_flags(UINT code_page, const char *errors)
6939{
6940 if (code_page == CP_UTF8) {
6941 if (winver.dwMajorVersion >= 6)
6942 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6943 and later */
6944 return WC_ERR_INVALID_CHARS;
6945 else
6946 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6947 return 0;
6948 }
6949 else if (code_page == CP_UTF7) {
6950 /* CP_UTF7 only supports flags=0 */
6951 return 0;
6952 }
6953 else {
6954 if (errors != NULL && strcmp(errors, "replace") == 0)
6955 return 0;
6956 else
6957 return WC_NO_BEST_FIT_CHARS;
6958 }
6959}
6960
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006962 * Encode a Unicode string to a Windows code page into a byte string in strict
6963 * mode.
6964 *
6965 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006966 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006967 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006968static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006969encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006970 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006972{
Victor Stinner554f3f02010-06-16 23:33:54 +00006973 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006974 BOOL *pusedDefaultChar = &usedDefaultChar;
6975 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006976 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006977 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006978 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 const DWORD flags = encode_code_page_flags(code_page, NULL);
6980 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006981 /* Create a substring so that we can get the UTF-16 representation
6982 of just the slice under consideration. */
6983 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006984
Martin v. Löwis3d325192011-11-04 18:23:06 +01006985 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006986
Victor Stinner3a50e702011-10-18 21:21:00 +02006987 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006988 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006989 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006990 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006991
Victor Stinner2fc507f2011-11-04 20:06:39 +01006992 substring = PyUnicode_Substring(unicode, offset, offset+len);
6993 if (substring == NULL)
6994 return -1;
6995 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6996 if (p == NULL) {
6997 Py_DECREF(substring);
6998 return -1;
6999 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007000 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007001
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007002 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007003 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007004 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 NULL, 0,
7006 NULL, pusedDefaultChar);
7007 if (outsize <= 0)
7008 goto error;
7009 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007010 if (pusedDefaultChar && *pusedDefaultChar) {
7011 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007012 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007013 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007014
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007018 if (*outbytes == NULL) {
7019 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007021 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007022 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023 }
7024 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007026 const Py_ssize_t n = PyBytes_Size(*outbytes);
7027 if (outsize > PY_SSIZE_T_MAX - n) {
7028 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007029 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007031 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007032 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7033 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007034 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007035 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007036 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037 }
7038
7039 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007040 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007041 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007042 out, outsize,
7043 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007044 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007045 if (outsize <= 0)
7046 goto error;
7047 if (pusedDefaultChar && *pusedDefaultChar)
7048 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007049 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007050
Victor Stinner3a50e702011-10-18 21:21:00 +02007051error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007052 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7054 return -2;
7055 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007056 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007057}
7058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059/*
7060 * Encode a Unicode string to a Windows code page into a byte string using a
7061 * error handler.
7062 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007063 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007064 * -1 on other error.
7065 */
7066static int
7067encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007068 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007069 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007070{
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007072 Py_ssize_t pos = unicode_offset;
7073 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 /* Ideally, we should get reason from FormatMessage. This is the Windows
7075 2000 English version of the message. */
7076 const char *reason = "invalid character";
7077 /* 4=maximum length of a UTF-8 sequence */
7078 char buffer[4];
7079 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7080 Py_ssize_t outsize;
7081 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 PyObject *errorHandler = NULL;
7083 PyObject *exc = NULL;
7084 PyObject *encoding_obj = NULL;
7085 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007086 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 PyObject *rep;
7088 int ret = -1;
7089
7090 assert(insize > 0);
7091
7092 encoding = code_page_name(code_page, &encoding_obj);
7093 if (encoding == NULL)
7094 return -1;
7095
7096 if (errors == NULL || strcmp(errors, "strict") == 0) {
7097 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7098 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007099 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 if (exc != NULL) {
7101 PyCodec_StrictErrors(exc);
7102 Py_DECREF(exc);
7103 }
7104 Py_XDECREF(encoding_obj);
7105 return -1;
7106 }
7107
7108 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7109 pusedDefaultChar = &usedDefaultChar;
7110 else
7111 pusedDefaultChar = NULL;
7112
7113 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7114 PyErr_NoMemory();
7115 goto error;
7116 }
7117 outsize = insize * Py_ARRAY_LENGTH(buffer);
7118
7119 if (*outbytes == NULL) {
7120 /* Create string object */
7121 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7122 if (*outbytes == NULL)
7123 goto error;
7124 out = PyBytes_AS_STRING(*outbytes);
7125 }
7126 else {
7127 /* Extend string object */
7128 Py_ssize_t n = PyBytes_Size(*outbytes);
7129 if (n > PY_SSIZE_T_MAX - outsize) {
7130 PyErr_NoMemory();
7131 goto error;
7132 }
7133 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7134 goto error;
7135 out = PyBytes_AS_STRING(*outbytes) + n;
7136 }
7137
7138 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007139 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007141 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7142 wchar_t chars[2];
7143 int charsize;
7144 if (ch < 0x10000) {
7145 chars[0] = (wchar_t)ch;
7146 charsize = 1;
7147 }
7148 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007149 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7150 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007151 charsize = 2;
7152 }
7153
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007155 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 buffer, Py_ARRAY_LENGTH(buffer),
7157 NULL, pusedDefaultChar);
7158 if (outsize > 0) {
7159 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7160 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007161 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 memcpy(out, buffer, outsize);
7163 out += outsize;
7164 continue;
7165 }
7166 }
7167 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7168 PyErr_SetFromWindowsErr(0);
7169 goto error;
7170 }
7171
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 rep = unicode_encode_call_errorhandler(
7173 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007174 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007175 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 if (rep == NULL)
7177 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007178 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007179
7180 if (PyBytes_Check(rep)) {
7181 outsize = PyBytes_GET_SIZE(rep);
7182 if (outsize != 1) {
7183 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7184 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7185 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7186 Py_DECREF(rep);
7187 goto error;
7188 }
7189 out = PyBytes_AS_STRING(*outbytes) + offset;
7190 }
7191 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7192 out += outsize;
7193 }
7194 else {
7195 Py_ssize_t i;
7196 enum PyUnicode_Kind kind;
7197 void *data;
7198
Benjamin Petersonbac79492012-01-14 13:34:47 -05007199 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 Py_DECREF(rep);
7201 goto error;
7202 }
7203
7204 outsize = PyUnicode_GET_LENGTH(rep);
7205 if (outsize != 1) {
7206 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7207 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7208 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7209 Py_DECREF(rep);
7210 goto error;
7211 }
7212 out = PyBytes_AS_STRING(*outbytes) + offset;
7213 }
7214 kind = PyUnicode_KIND(rep);
7215 data = PyUnicode_DATA(rep);
7216 for (i=0; i < outsize; i++) {
7217 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7218 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007219 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007220 encoding, unicode,
7221 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 "unable to encode error handler result to ASCII");
7223 Py_DECREF(rep);
7224 goto error;
7225 }
7226 *out = (unsigned char)ch;
7227 out++;
7228 }
7229 }
7230 Py_DECREF(rep);
7231 }
7232 /* write a NUL byte */
7233 *out = 0;
7234 outsize = out - PyBytes_AS_STRING(*outbytes);
7235 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7236 if (_PyBytes_Resize(outbytes, outsize) < 0)
7237 goto error;
7238 ret = 0;
7239
7240error:
7241 Py_XDECREF(encoding_obj);
7242 Py_XDECREF(errorHandler);
7243 Py_XDECREF(exc);
7244 return ret;
7245}
7246
Victor Stinner3a50e702011-10-18 21:21:00 +02007247static PyObject *
7248encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007249 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 const char *errors)
7251{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007252 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007254 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007255 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007256
Benjamin Petersonbac79492012-01-14 13:34:47 -05007257 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007258 return NULL;
7259 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007260
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 if (code_page < 0) {
7262 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7263 return NULL;
7264 }
7265
Martin v. Löwis3d325192011-11-04 18:23:06 +01007266 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007267 return PyBytes_FromStringAndSize(NULL, 0);
7268
Victor Stinner7581cef2011-11-03 22:32:33 +01007269 offset = 0;
7270 do
7271 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007274 chunks. */
7275 if (len > INT_MAX/2) {
7276 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007277 done = 0;
7278 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007279 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007280#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007281 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007282 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007283 done = 1;
7284 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007285
Victor Stinner76a31a62011-11-04 00:05:13 +01007286 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007287 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 errors);
7289 if (ret == -2)
7290 ret = encode_code_page_errors(code_page, &outbytes,
7291 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007292 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007293 if (ret < 0) {
7294 Py_XDECREF(outbytes);
7295 return NULL;
7296 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007297
Victor Stinner7581cef2011-11-03 22:32:33 +01007298 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007299 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007300 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 return outbytes;
7303}
7304
7305PyObject *
7306PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7307 Py_ssize_t size,
7308 const char *errors)
7309{
Victor Stinner7581cef2011-11-03 22:32:33 +01007310 PyObject *unicode, *res;
7311 unicode = PyUnicode_FromUnicode(p, size);
7312 if (unicode == NULL)
7313 return NULL;
7314 res = encode_code_page(CP_ACP, unicode, errors);
7315 Py_DECREF(unicode);
7316 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007317}
7318
7319PyObject *
7320PyUnicode_EncodeCodePage(int code_page,
7321 PyObject *unicode,
7322 const char *errors)
7323{
Victor Stinner7581cef2011-11-03 22:32:33 +01007324 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007325}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007326
Alexander Belopolsky40018472011-02-26 01:02:56 +00007327PyObject *
7328PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007329{
7330 if (!PyUnicode_Check(unicode)) {
7331 PyErr_BadArgument();
7332 return NULL;
7333 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007334 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007335}
7336
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337#undef NEED_RETRY
7338
Victor Stinner99b95382011-07-04 14:23:54 +02007339#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007340
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341/* --- Character Mapping Codec -------------------------------------------- */
7342
Victor Stinnerfb161b12013-04-18 01:44:27 +02007343static int
7344charmap_decode_string(const char *s,
7345 Py_ssize_t size,
7346 PyObject *mapping,
7347 const char *errors,
7348 _PyUnicodeWriter *writer)
7349{
7350 const char *starts = s;
7351 const char *e;
7352 Py_ssize_t startinpos, endinpos;
7353 PyObject *errorHandler = NULL, *exc = NULL;
7354 Py_ssize_t maplen;
7355 enum PyUnicode_Kind mapkind;
7356 void *mapdata;
7357 Py_UCS4 x;
7358 unsigned char ch;
7359
7360 if (PyUnicode_READY(mapping) == -1)
7361 return -1;
7362
7363 maplen = PyUnicode_GET_LENGTH(mapping);
7364 mapdata = PyUnicode_DATA(mapping);
7365 mapkind = PyUnicode_KIND(mapping);
7366
7367 e = s + size;
7368
7369 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7370 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7371 * is disabled in encoding aliases, latin1 is preferred because
7372 * its implementation is faster. */
7373 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7374 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7375 Py_UCS4 maxchar = writer->maxchar;
7376
7377 assert (writer->kind == PyUnicode_1BYTE_KIND);
7378 while (s < e) {
7379 ch = *s;
7380 x = mapdata_ucs1[ch];
7381 if (x > maxchar) {
7382 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7383 goto onError;
7384 maxchar = writer->maxchar;
7385 outdata = (Py_UCS1 *)writer->data;
7386 }
7387 outdata[writer->pos] = x;
7388 writer->pos++;
7389 ++s;
7390 }
7391 return 0;
7392 }
7393
7394 while (s < e) {
7395 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7396 enum PyUnicode_Kind outkind = writer->kind;
7397 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7398 if (outkind == PyUnicode_1BYTE_KIND) {
7399 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7400 Py_UCS4 maxchar = writer->maxchar;
7401 while (s < e) {
7402 ch = *s;
7403 x = mapdata_ucs2[ch];
7404 if (x > maxchar)
7405 goto Error;
7406 outdata[writer->pos] = x;
7407 writer->pos++;
7408 ++s;
7409 }
7410 break;
7411 }
7412 else if (outkind == PyUnicode_2BYTE_KIND) {
7413 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7414 while (s < e) {
7415 ch = *s;
7416 x = mapdata_ucs2[ch];
7417 if (x == 0xFFFE)
7418 goto Error;
7419 outdata[writer->pos] = x;
7420 writer->pos++;
7421 ++s;
7422 }
7423 break;
7424 }
7425 }
7426 ch = *s;
7427
7428 if (ch < maplen)
7429 x = PyUnicode_READ(mapkind, mapdata, ch);
7430 else
7431 x = 0xfffe; /* invalid value */
7432Error:
7433 if (x == 0xfffe)
7434 {
7435 /* undefined mapping */
7436 startinpos = s-starts;
7437 endinpos = startinpos+1;
7438 if (unicode_decode_call_errorhandler_writer(
7439 errors, &errorHandler,
7440 "charmap", "character maps to <undefined>",
7441 &starts, &e, &startinpos, &endinpos, &exc, &s,
7442 writer)) {
7443 goto onError;
7444 }
7445 continue;
7446 }
7447
7448 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7449 goto onError;
7450 ++s;
7451 }
7452 Py_XDECREF(errorHandler);
7453 Py_XDECREF(exc);
7454 return 0;
7455
7456onError:
7457 Py_XDECREF(errorHandler);
7458 Py_XDECREF(exc);
7459 return -1;
7460}
7461
7462static int
7463charmap_decode_mapping(const char *s,
7464 Py_ssize_t size,
7465 PyObject *mapping,
7466 const char *errors,
7467 _PyUnicodeWriter *writer)
7468{
7469 const char *starts = s;
7470 const char *e;
7471 Py_ssize_t startinpos, endinpos;
7472 PyObject *errorHandler = NULL, *exc = NULL;
7473 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007474 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007475
7476 e = s + size;
7477
7478 while (s < e) {
7479 ch = *s;
7480
7481 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7482 key = PyLong_FromLong((long)ch);
7483 if (key == NULL)
7484 goto onError;
7485
7486 item = PyObject_GetItem(mapping, key);
7487 Py_DECREF(key);
7488 if (item == NULL) {
7489 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7490 /* No mapping found means: mapping is undefined. */
7491 PyErr_Clear();
7492 goto Undefined;
7493 } else
7494 goto onError;
7495 }
7496
7497 /* Apply mapping */
7498 if (item == Py_None)
7499 goto Undefined;
7500 if (PyLong_Check(item)) {
7501 long value = PyLong_AS_LONG(item);
7502 if (value == 0xFFFE)
7503 goto Undefined;
7504 if (value < 0 || value > MAX_UNICODE) {
7505 PyErr_Format(PyExc_TypeError,
7506 "character mapping must be in range(0x%lx)",
7507 (unsigned long)MAX_UNICODE + 1);
7508 goto onError;
7509 }
7510
7511 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7512 goto onError;
7513 }
7514 else if (PyUnicode_Check(item)) {
7515 if (PyUnicode_READY(item) == -1)
7516 goto onError;
7517 if (PyUnicode_GET_LENGTH(item) == 1) {
7518 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7519 if (value == 0xFFFE)
7520 goto Undefined;
7521 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7522 goto onError;
7523 }
7524 else {
7525 writer->overallocate = 1;
7526 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7527 goto onError;
7528 }
7529 }
7530 else {
7531 /* wrong return value */
7532 PyErr_SetString(PyExc_TypeError,
7533 "character mapping must return integer, None or str");
7534 goto onError;
7535 }
7536 Py_CLEAR(item);
7537 ++s;
7538 continue;
7539
7540Undefined:
7541 /* undefined mapping */
7542 Py_CLEAR(item);
7543 startinpos = s-starts;
7544 endinpos = startinpos+1;
7545 if (unicode_decode_call_errorhandler_writer(
7546 errors, &errorHandler,
7547 "charmap", "character maps to <undefined>",
7548 &starts, &e, &startinpos, &endinpos, &exc, &s,
7549 writer)) {
7550 goto onError;
7551 }
7552 }
7553 Py_XDECREF(errorHandler);
7554 Py_XDECREF(exc);
7555 return 0;
7556
7557onError:
7558 Py_XDECREF(item);
7559 Py_XDECREF(errorHandler);
7560 Py_XDECREF(exc);
7561 return -1;
7562}
7563
Alexander Belopolsky40018472011-02-26 01:02:56 +00007564PyObject *
7565PyUnicode_DecodeCharmap(const char *s,
7566 Py_ssize_t size,
7567 PyObject *mapping,
7568 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007570 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007571
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 /* Default to Latin-1 */
7573 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007577 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007578 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007579 writer.min_length = size;
7580 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007582
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007583 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007584 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7585 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007586 }
7587 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007588 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7589 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007591 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007592
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007594 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 return NULL;
7596}
7597
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007598/* Charmap encoding: the lookup table */
7599
Alexander Belopolsky40018472011-02-26 01:02:56 +00007600struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 PyObject_HEAD
7602 unsigned char level1[32];
7603 int count2, count3;
7604 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007605};
7606
7607static PyObject*
7608encoding_map_size(PyObject *obj, PyObject* args)
7609{
7610 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007613}
7614
7615static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007616 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 PyDoc_STR("Return the size (in bytes) of this object") },
7618 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619};
7620
7621static void
7622encoding_map_dealloc(PyObject* o)
7623{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007624 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007625}
7626
7627static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 "EncodingMap", /*tp_name*/
7630 sizeof(struct encoding_map), /*tp_basicsize*/
7631 0, /*tp_itemsize*/
7632 /* methods */
7633 encoding_map_dealloc, /*tp_dealloc*/
7634 0, /*tp_print*/
7635 0, /*tp_getattr*/
7636 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007637 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 0, /*tp_repr*/
7639 0, /*tp_as_number*/
7640 0, /*tp_as_sequence*/
7641 0, /*tp_as_mapping*/
7642 0, /*tp_hash*/
7643 0, /*tp_call*/
7644 0, /*tp_str*/
7645 0, /*tp_getattro*/
7646 0, /*tp_setattro*/
7647 0, /*tp_as_buffer*/
7648 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7649 0, /*tp_doc*/
7650 0, /*tp_traverse*/
7651 0, /*tp_clear*/
7652 0, /*tp_richcompare*/
7653 0, /*tp_weaklistoffset*/
7654 0, /*tp_iter*/
7655 0, /*tp_iternext*/
7656 encoding_map_methods, /*tp_methods*/
7657 0, /*tp_members*/
7658 0, /*tp_getset*/
7659 0, /*tp_base*/
7660 0, /*tp_dict*/
7661 0, /*tp_descr_get*/
7662 0, /*tp_descr_set*/
7663 0, /*tp_dictoffset*/
7664 0, /*tp_init*/
7665 0, /*tp_alloc*/
7666 0, /*tp_new*/
7667 0, /*tp_free*/
7668 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007669};
7670
7671PyObject*
7672PyUnicode_BuildEncodingMap(PyObject* string)
7673{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007674 PyObject *result;
7675 struct encoding_map *mresult;
7676 int i;
7677 int need_dict = 0;
7678 unsigned char level1[32];
7679 unsigned char level2[512];
7680 unsigned char *mlevel1, *mlevel2, *mlevel3;
7681 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007682 int kind;
7683 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007684 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007685 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007686
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007687 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007688 PyErr_BadArgument();
7689 return NULL;
7690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007691 kind = PyUnicode_KIND(string);
7692 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007693 length = PyUnicode_GET_LENGTH(string);
7694 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007695 memset(level1, 0xFF, sizeof level1);
7696 memset(level2, 0xFF, sizeof level2);
7697
7698 /* If there isn't a one-to-one mapping of NULL to \0,
7699 or if there are non-BMP characters, we need to use
7700 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007701 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007702 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007703 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007705 ch = PyUnicode_READ(kind, data, i);
7706 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007707 need_dict = 1;
7708 break;
7709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007710 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007711 /* unmapped character */
7712 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007713 l1 = ch >> 11;
7714 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007715 if (level1[l1] == 0xFF)
7716 level1[l1] = count2++;
7717 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007718 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007719 }
7720
7721 if (count2 >= 0xFF || count3 >= 0xFF)
7722 need_dict = 1;
7723
7724 if (need_dict) {
7725 PyObject *result = PyDict_New();
7726 PyObject *key, *value;
7727 if (!result)
7728 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007729 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007731 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007732 if (!key || !value)
7733 goto failed1;
7734 if (PyDict_SetItem(result, key, value) == -1)
7735 goto failed1;
7736 Py_DECREF(key);
7737 Py_DECREF(value);
7738 }
7739 return result;
7740 failed1:
7741 Py_XDECREF(key);
7742 Py_XDECREF(value);
7743 Py_DECREF(result);
7744 return NULL;
7745 }
7746
7747 /* Create a three-level trie */
7748 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7749 16*count2 + 128*count3 - 1);
7750 if (!result)
7751 return PyErr_NoMemory();
7752 PyObject_Init(result, &EncodingMapType);
7753 mresult = (struct encoding_map*)result;
7754 mresult->count2 = count2;
7755 mresult->count3 = count3;
7756 mlevel1 = mresult->level1;
7757 mlevel2 = mresult->level23;
7758 mlevel3 = mresult->level23 + 16*count2;
7759 memcpy(mlevel1, level1, 32);
7760 memset(mlevel2, 0xFF, 16*count2);
7761 memset(mlevel3, 0, 128*count3);
7762 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007763 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007764 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007765 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7766 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767 /* unmapped character */
7768 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007769 o1 = ch>>11;
7770 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007771 i2 = 16*mlevel1[o1] + o2;
7772 if (mlevel2[i2] == 0xFF)
7773 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007774 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775 i3 = 128*mlevel2[i2] + o3;
7776 mlevel3[i3] = i;
7777 }
7778 return result;
7779}
7780
7781static int
Victor Stinner22168992011-11-20 17:09:18 +01007782encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783{
7784 struct encoding_map *map = (struct encoding_map*)mapping;
7785 int l1 = c>>11;
7786 int l2 = (c>>7) & 0xF;
7787 int l3 = c & 0x7F;
7788 int i;
7789
Victor Stinner22168992011-11-20 17:09:18 +01007790 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007792 if (c == 0)
7793 return 0;
7794 /* level 1*/
7795 i = map->level1[l1];
7796 if (i == 0xFF) {
7797 return -1;
7798 }
7799 /* level 2*/
7800 i = map->level23[16*i+l2];
7801 if (i == 0xFF) {
7802 return -1;
7803 }
7804 /* level 3 */
7805 i = map->level23[16*map->count2 + 128*i + l3];
7806 if (i == 0) {
7807 return -1;
7808 }
7809 return i;
7810}
7811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812/* Lookup the character ch in the mapping. If the character
7813 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007814 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007815static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007816charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817{
Christian Heimes217cfd12007-12-02 14:31:20 +00007818 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007819 PyObject *x;
7820
7821 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823 x = PyObject_GetItem(mapping, w);
7824 Py_DECREF(w);
7825 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7827 /* No mapping found means: mapping is undefined. */
7828 PyErr_Clear();
7829 x = Py_None;
7830 Py_INCREF(x);
7831 return x;
7832 } else
7833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007835 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007837 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 long value = PyLong_AS_LONG(x);
7839 if (value < 0 || value > 255) {
7840 PyErr_SetString(PyExc_TypeError,
7841 "character mapping must be in range(256)");
7842 Py_DECREF(x);
7843 return NULL;
7844 }
7845 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007847 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 /* wrong return value */
7851 PyErr_Format(PyExc_TypeError,
7852 "character mapping must return integer, bytes or None, not %.400s",
7853 x->ob_type->tp_name);
7854 Py_DECREF(x);
7855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 }
7857}
7858
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007859static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007860charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007862 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7863 /* exponentially overallocate to minimize reallocations */
7864 if (requiredsize < 2*outsize)
7865 requiredsize = 2*outsize;
7866 if (_PyBytes_Resize(outobj, requiredsize))
7867 return -1;
7868 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007869}
7870
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007874/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007875 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876 space is available. Return a new reference to the object that
7877 was put in the output buffer, or Py_None, if the mapping was undefined
7878 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007879 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007880static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007881charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007882 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007883{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007884 PyObject *rep;
7885 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007886 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007887
Christian Heimes90aa7642007-12-19 02:45:37 +00007888 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 if (res == -1)
7892 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 if (outsize<requiredsize)
7894 if (charmapencode_resize(outobj, outpos, requiredsize))
7895 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007896 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 outstart[(*outpos)++] = (char)res;
7898 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899 }
7900
7901 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007902 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007904 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_DECREF(rep);
7906 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007907 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 if (PyLong_Check(rep)) {
7909 Py_ssize_t requiredsize = *outpos+1;
7910 if (outsize<requiredsize)
7911 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7912 Py_DECREF(rep);
7913 return enc_EXCEPTION;
7914 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007915 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007917 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 else {
7919 const char *repchars = PyBytes_AS_STRING(rep);
7920 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7921 Py_ssize_t requiredsize = *outpos+repsize;
7922 if (outsize<requiredsize)
7923 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7924 Py_DECREF(rep);
7925 return enc_EXCEPTION;
7926 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007927 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 memcpy(outstart + *outpos, repchars, repsize);
7929 *outpos += repsize;
7930 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007931 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007932 Py_DECREF(rep);
7933 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934}
7935
7936/* handle an error in PyUnicode_EncodeCharmap
7937 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007938static int
7939charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007940 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007942 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007943 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007944{
7945 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007946 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007947 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007948 enum PyUnicode_Kind kind;
7949 void *data;
7950 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007951 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007952 Py_ssize_t collstartpos = *inpos;
7953 Py_ssize_t collendpos = *inpos+1;
7954 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007955 char *encoding = "charmap";
7956 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007957 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007958 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007959 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007960
Benjamin Petersonbac79492012-01-14 13:34:47 -05007961 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007962 return -1;
7963 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007964 /* find all unencodable characters */
7965 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007966 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007967 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007968 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007969 val = encoding_map_lookup(ch, mapping);
7970 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 break;
7972 ++collendpos;
7973 continue;
7974 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007975
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007976 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7977 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 if (rep==NULL)
7979 return -1;
7980 else if (rep!=Py_None) {
7981 Py_DECREF(rep);
7982 break;
7983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007984 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986 }
7987 /* cache callback name lookup
7988 * (if not done yet, i.e. it's the first error) */
7989 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 if ((errors==NULL) || (!strcmp(errors, "strict")))
7991 *known_errorHandler = 1;
7992 else if (!strcmp(errors, "replace"))
7993 *known_errorHandler = 2;
7994 else if (!strcmp(errors, "ignore"))
7995 *known_errorHandler = 3;
7996 else if (!strcmp(errors, "xmlcharrefreplace"))
7997 *known_errorHandler = 4;
7998 else
7999 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008000 }
8001 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008002 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008003 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 return -1;
8005 case 2: /* replace */
8006 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 x = charmapencode_output('?', mapping, res, respos);
8008 if (x==enc_EXCEPTION) {
8009 return -1;
8010 }
8011 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008012 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 return -1;
8014 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008015 }
8016 /* fall through */
8017 case 3: /* ignore */
8018 *inpos = collendpos;
8019 break;
8020 case 4: /* xmlcharrefreplace */
8021 /* generate replacement (temporarily (mis)uses p) */
8022 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 char buffer[2+29+1+1];
8024 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008025 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 for (cp = buffer; *cp; ++cp) {
8027 x = charmapencode_output(*cp, mapping, res, respos);
8028 if (x==enc_EXCEPTION)
8029 return -1;
8030 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008031 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 return -1;
8033 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008034 }
8035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 *inpos = collendpos;
8037 break;
8038 default:
8039 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008040 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008042 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008044 if (PyBytes_Check(repunicode)) {
8045 /* Directly copy bytes result to output. */
8046 Py_ssize_t outsize = PyBytes_Size(*res);
8047 Py_ssize_t requiredsize;
8048 repsize = PyBytes_Size(repunicode);
8049 requiredsize = *respos + repsize;
8050 if (requiredsize > outsize)
8051 /* Make room for all additional bytes. */
8052 if (charmapencode_resize(res, respos, requiredsize)) {
8053 Py_DECREF(repunicode);
8054 return -1;
8055 }
8056 memcpy(PyBytes_AsString(*res) + *respos,
8057 PyBytes_AsString(repunicode), repsize);
8058 *respos += repsize;
8059 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008060 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008061 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008062 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008063 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008064 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008065 Py_DECREF(repunicode);
8066 return -1;
8067 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008068 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008069 data = PyUnicode_DATA(repunicode);
8070 kind = PyUnicode_KIND(repunicode);
8071 for (index = 0; index < repsize; index++) {
8072 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8073 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008075 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 return -1;
8077 }
8078 else if (x==enc_FAILED) {
8079 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008080 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 return -1;
8082 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 }
8084 *inpos = newpos;
8085 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086 }
8087 return 0;
8088}
8089
Alexander Belopolsky40018472011-02-26 01:02:56 +00008090PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008091_PyUnicode_EncodeCharmap(PyObject *unicode,
8092 PyObject *mapping,
8093 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095 /* output object */
8096 PyObject *res = NULL;
8097 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008098 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008099 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008101 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102 PyObject *errorHandler = NULL;
8103 PyObject *exc = NULL;
8104 /* the following variable is used for caching string comparisons
8105 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8106 * 3=ignore, 4=xmlcharrefreplace */
8107 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008108 void *data;
8109 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110
Benjamin Petersonbac79492012-01-14 13:34:47 -05008111 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008112 return NULL;
8113 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008114 data = PyUnicode_DATA(unicode);
8115 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008116
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 /* Default to Latin-1 */
8118 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008119 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 /* allocate enough for a simple encoding without
8122 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008123 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008124 if (res == NULL)
8125 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008126 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008130 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008132 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 if (x==enc_EXCEPTION) /* error */
8134 goto onError;
8135 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008136 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 &exc,
8138 &known_errorHandler, &errorHandler, errors,
8139 &res, &respos)) {
8140 goto onError;
8141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 else
8144 /* done with this character => adjust input position */
8145 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008149 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008150 if (_PyBytes_Resize(&res, respos) < 0)
8151 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008152
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153 Py_XDECREF(exc);
8154 Py_XDECREF(errorHandler);
8155 return res;
8156
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 Py_XDECREF(res);
8159 Py_XDECREF(exc);
8160 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 return NULL;
8162}
8163
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008164/* Deprecated */
8165PyObject *
8166PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8167 Py_ssize_t size,
8168 PyObject *mapping,
8169 const char *errors)
8170{
8171 PyObject *result;
8172 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8173 if (unicode == NULL)
8174 return NULL;
8175 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8176 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008177 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008178}
8179
Alexander Belopolsky40018472011-02-26 01:02:56 +00008180PyObject *
8181PyUnicode_AsCharmapString(PyObject *unicode,
8182 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183{
8184 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 PyErr_BadArgument();
8186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008188 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189}
8190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008192static void
8193make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008194 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008195 Py_ssize_t startpos, Py_ssize_t endpos,
8196 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199 *exceptionObject = _PyUnicodeTranslateError_Create(
8200 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 }
8202 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8204 goto onError;
8205 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8206 goto onError;
8207 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8208 goto onError;
8209 return;
8210 onError:
8211 Py_DECREF(*exceptionObject);
8212 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 }
8214}
8215
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216/* error handling callback helper:
8217 build arguments, call the callback and check the arguments,
8218 put the result into newpos and return the replacement string, which
8219 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008220static PyObject *
8221unicode_translate_call_errorhandler(const char *errors,
8222 PyObject **errorHandler,
8223 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008224 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008225 Py_ssize_t startpos, Py_ssize_t endpos,
8226 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008228 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008230 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231 PyObject *restuple;
8232 PyObject *resunicode;
8233
8234 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238 }
8239
8240 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244
8245 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008250 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 Py_DECREF(restuple);
8252 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 }
8254 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 &resunicode, &i_newpos)) {
8256 Py_DECREF(restuple);
8257 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008259 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008261 else
8262 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8265 Py_DECREF(restuple);
8266 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 Py_INCREF(resunicode);
8269 Py_DECREF(restuple);
8270 return resunicode;
8271}
8272
8273/* Lookup the character ch in the mapping and put the result in result,
8274 which must be decrefed by the caller.
8275 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008276static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278{
Christian Heimes217cfd12007-12-02 14:31:20 +00008279 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 PyObject *x;
8281
8282 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284 x = PyObject_GetItem(mapping, w);
8285 Py_DECREF(w);
8286 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8288 /* No mapping found means: use 1:1 mapping. */
8289 PyErr_Clear();
8290 *result = NULL;
8291 return 0;
8292 } else
8293 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 }
8295 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 *result = x;
8297 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008299 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 long value = PyLong_AS_LONG(x);
8301 long max = PyUnicode_GetMax();
8302 if (value < 0 || value > max) {
8303 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008304 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 Py_DECREF(x);
8306 return -1;
8307 }
8308 *result = x;
8309 return 0;
8310 }
8311 else if (PyUnicode_Check(x)) {
8312 *result = x;
8313 return 0;
8314 }
8315 else {
8316 /* wrong return value */
8317 PyErr_SetString(PyExc_TypeError,
8318 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008319 Py_DECREF(x);
8320 return -1;
8321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322}
8323/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 if not reallocate and adjust various state variables.
8325 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008331 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008332 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 /* exponentially overallocate to minimize reallocations */
8334 if (requiredsize < 2 * oldsize)
8335 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008336 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8337 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008339 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 }
8342 return 0;
8343}
8344/* lookup the character, put the result in the output string and adjust
8345 various state variables. Return a new reference to the object that
8346 was put in the output buffer in *result, or Py_None, if the mapping was
8347 undefined (in which case no character was written).
8348 The called must decref result.
8349 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008350static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8352 PyObject *mapping, Py_UCS4 **output,
8353 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008354 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8357 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 }
8363 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008365 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 }
8369 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 Py_ssize_t repsize;
8371 if (PyUnicode_READY(*res) == -1)
8372 return -1;
8373 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 if (repsize==1) {
8375 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 }
8378 else if (repsize!=0) {
8379 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 Py_ssize_t requiredsize = *opos +
8381 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 Py_ssize_t i;
8384 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 for(i = 0; i < repsize; i++)
8387 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 }
8390 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 return 0;
8393}
8394
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396_PyUnicode_TranslateCharmap(PyObject *input,
8397 PyObject *mapping,
8398 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 /* input object */
8401 char *idata;
8402 Py_ssize_t size, i;
8403 int kind;
8404 /* output buffer */
8405 Py_UCS4 *output = NULL;
8406 Py_ssize_t osize;
8407 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 char *reason = "character maps to <undefined>";
8411 PyObject *errorHandler = NULL;
8412 PyObject *exc = NULL;
8413 /* the following variable is used for caching string comparisons
8414 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8415 * 3=ignore, 4=xmlcharrefreplace */
8416 int known_errorHandler = -1;
8417
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 PyErr_BadArgument();
8420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423 if (PyUnicode_READY(input) == -1)
8424 return NULL;
8425 idata = (char*)PyUnicode_DATA(input);
8426 kind = PyUnicode_KIND(input);
8427 size = PyUnicode_GET_LENGTH(input);
8428 i = 0;
8429
8430 if (size == 0) {
8431 Py_INCREF(input);
8432 return input;
8433 }
8434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 /* allocate enough for a simple 1:1 translation without
8436 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 osize = size;
8438 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8439 opos = 0;
8440 if (output == NULL) {
8441 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008445 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 /* try to encode it */
8447 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 if (charmaptranslate_output(input, i, mapping,
8449 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 Py_XDECREF(x);
8451 goto onError;
8452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 else { /* untranslatable character */
8457 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8458 Py_ssize_t repsize;
8459 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 Py_ssize_t collstart = i;
8463 Py_ssize_t collend = i+1;
8464 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 while (collend < size) {
8468 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 goto onError;
8470 Py_XDECREF(x);
8471 if (x!=Py_None)
8472 break;
8473 ++collend;
8474 }
8475 /* cache callback name lookup
8476 * (if not done yet, i.e. it's the first error) */
8477 if (known_errorHandler==-1) {
8478 if ((errors==NULL) || (!strcmp(errors, "strict")))
8479 known_errorHandler = 1;
8480 else if (!strcmp(errors, "replace"))
8481 known_errorHandler = 2;
8482 else if (!strcmp(errors, "ignore"))
8483 known_errorHandler = 3;
8484 else if (!strcmp(errors, "xmlcharrefreplace"))
8485 known_errorHandler = 4;
8486 else
8487 known_errorHandler = 0;
8488 }
8489 switch (known_errorHandler) {
8490 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008491 make_translate_exception(&exc,
8492 input, collstart, collend, reason);
8493 if (exc != NULL)
8494 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008495 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 case 2: /* replace */
8497 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 for (coll = collstart; coll<collend; coll++)
8499 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 /* fall through */
8501 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 break;
8504 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 /* generate replacement (temporarily (mis)uses i) */
8506 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 char buffer[2+29+1+1];
8508 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8510 if (charmaptranslate_makespace(&output, &osize,
8511 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 goto onError;
8513 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 break;
8518 default:
8519 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 reason, input, &exc,
8521 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008522 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008524 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008525 Py_DECREF(repunicode);
8526 goto onError;
8527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 repsize = PyUnicode_GET_LENGTH(repunicode);
8530 if (charmaptranslate_makespace(&output, &osize,
8531 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 Py_DECREF(repunicode);
8533 goto onError;
8534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 for (uni2 = 0; repsize-->0; ++uni2)
8536 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8537 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008539 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008540 }
8541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8543 if (!res)
8544 goto onError;
8545 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 Py_XDECREF(exc);
8547 Py_XDECREF(errorHandler);
8548 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 Py_XDECREF(exc);
8553 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 return NULL;
8555}
8556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557/* Deprecated. Use PyUnicode_Translate instead. */
8558PyObject *
8559PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8560 Py_ssize_t size,
8561 PyObject *mapping,
8562 const char *errors)
8563{
Christian Heimes5f520f42012-09-11 14:03:25 +02008564 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8566 if (!unicode)
8567 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008568 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8569 Py_DECREF(unicode);
8570 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571}
8572
Alexander Belopolsky40018472011-02-26 01:02:56 +00008573PyObject *
8574PyUnicode_Translate(PyObject *str,
8575 PyObject *mapping,
8576 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577{
8578 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008579
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 str = PyUnicode_FromObject(str);
8581 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008582 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 Py_DECREF(str);
8585 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586}
Tim Petersced69f82003-09-16 20:30:58 +00008587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008589fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590{
8591 /* No need to call PyUnicode_READY(self) because this function is only
8592 called as a callback from fixup() which does it already. */
8593 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8594 const int kind = PyUnicode_KIND(self);
8595 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008596 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008597 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 Py_ssize_t i;
8599
8600 for (i = 0; i < len; ++i) {
8601 ch = PyUnicode_READ(kind, data, i);
8602 fixed = 0;
8603 if (ch > 127) {
8604 if (Py_UNICODE_ISSPACE(ch))
8605 fixed = ' ';
8606 else {
8607 const int decimal = Py_UNICODE_TODECIMAL(ch);
8608 if (decimal >= 0)
8609 fixed = '0' + decimal;
8610 }
8611 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008612 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008613 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 PyUnicode_WRITE(kind, data, i, fixed);
8615 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008616 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008617 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 }
8620
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008621 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622}
8623
8624PyObject *
8625_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8626{
8627 if (!PyUnicode_Check(unicode)) {
8628 PyErr_BadInternalCall();
8629 return NULL;
8630 }
8631 if (PyUnicode_READY(unicode) == -1)
8632 return NULL;
8633 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8634 /* If the string is already ASCII, just return the same string */
8635 Py_INCREF(unicode);
8636 return unicode;
8637 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008638 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639}
8640
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008641PyObject *
8642PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8643 Py_ssize_t length)
8644{
Victor Stinnerf0124502011-11-21 23:12:56 +01008645 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008646 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008647 Py_UCS4 maxchar;
8648 enum PyUnicode_Kind kind;
8649 void *data;
8650
Victor Stinner99d7ad02012-02-22 13:37:39 +01008651 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008652 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008653 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008654 if (ch > 127) {
8655 int decimal = Py_UNICODE_TODECIMAL(ch);
8656 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008657 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008658 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008659 }
8660 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008661
8662 /* Copy to a new string */
8663 decimal = PyUnicode_New(length, maxchar);
8664 if (decimal == NULL)
8665 return decimal;
8666 kind = PyUnicode_KIND(decimal);
8667 data = PyUnicode_DATA(decimal);
8668 /* Iterate over code points */
8669 for (i = 0; i < length; i++) {
8670 Py_UNICODE ch = s[i];
8671 if (ch > 127) {
8672 int decimal = Py_UNICODE_TODECIMAL(ch);
8673 if (decimal >= 0)
8674 ch = '0' + decimal;
8675 }
8676 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008678 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008679}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008680/* --- Decimal Encoder ---------------------------------------------------- */
8681
Alexander Belopolsky40018472011-02-26 01:02:56 +00008682int
8683PyUnicode_EncodeDecimal(Py_UNICODE *s,
8684 Py_ssize_t length,
8685 char *output,
8686 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008687{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008688 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008689 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008690 enum PyUnicode_Kind kind;
8691 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008692
8693 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 PyErr_BadArgument();
8695 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008696 }
8697
Victor Stinner42bf7752011-11-21 22:52:58 +01008698 unicode = PyUnicode_FromUnicode(s, length);
8699 if (unicode == NULL)
8700 return -1;
8701
Benjamin Petersonbac79492012-01-14 13:34:47 -05008702 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008703 Py_DECREF(unicode);
8704 return -1;
8705 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008706 kind = PyUnicode_KIND(unicode);
8707 data = PyUnicode_DATA(unicode);
8708
Victor Stinnerb84d7232011-11-22 01:50:07 +01008709 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008710 PyObject *exc;
8711 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008713 Py_ssize_t startpos;
8714
8715 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008716
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008718 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008719 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008721 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 decimal = Py_UNICODE_TODECIMAL(ch);
8723 if (decimal >= 0) {
8724 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008725 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 continue;
8727 }
8728 if (0 < ch && ch < 256) {
8729 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008730 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 continue;
8732 }
Victor Stinner6345be92011-11-25 20:09:01 +01008733
Victor Stinner42bf7752011-11-21 22:52:58 +01008734 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008735 exc = NULL;
8736 raise_encode_exception(&exc, "decimal", unicode,
8737 startpos, startpos+1,
8738 "invalid decimal Unicode string");
8739 Py_XDECREF(exc);
8740 Py_DECREF(unicode);
8741 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008742 }
8743 /* 0-terminate the output string */
8744 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008745 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008746 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008747}
8748
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749/* --- Helpers ------------------------------------------------------------ */
8750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008752any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 Py_ssize_t start,
8754 Py_ssize_t end)
8755{
8756 int kind1, kind2, kind;
8757 void *buf1, *buf2;
8758 Py_ssize_t len1, len2, result;
8759
8760 kind1 = PyUnicode_KIND(s1);
8761 kind2 = PyUnicode_KIND(s2);
8762 kind = kind1 > kind2 ? kind1 : kind2;
8763 buf1 = PyUnicode_DATA(s1);
8764 buf2 = PyUnicode_DATA(s2);
8765 if (kind1 != kind)
8766 buf1 = _PyUnicode_AsKind(s1, kind);
8767 if (!buf1)
8768 return -2;
8769 if (kind2 != kind)
8770 buf2 = _PyUnicode_AsKind(s2, kind);
8771 if (!buf2) {
8772 if (kind1 != kind) PyMem_Free(buf1);
8773 return -2;
8774 }
8775 len1 = PyUnicode_GET_LENGTH(s1);
8776 len2 = PyUnicode_GET_LENGTH(s2);
8777
Victor Stinner794d5672011-10-10 03:21:36 +02008778 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008779 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008780 case PyUnicode_1BYTE_KIND:
8781 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8782 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8783 else
8784 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8785 break;
8786 case PyUnicode_2BYTE_KIND:
8787 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8788 break;
8789 case PyUnicode_4BYTE_KIND:
8790 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8791 break;
8792 default:
8793 assert(0); result = -2;
8794 }
8795 }
8796 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008797 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008798 case PyUnicode_1BYTE_KIND:
8799 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8800 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8801 else
8802 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8803 break;
8804 case PyUnicode_2BYTE_KIND:
8805 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8806 break;
8807 case PyUnicode_4BYTE_KIND:
8808 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8809 break;
8810 default:
8811 assert(0); result = -2;
8812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 }
8814
8815 if (kind1 != kind)
8816 PyMem_Free(buf1);
8817 if (kind2 != kind)
8818 PyMem_Free(buf2);
8819
8820 return result;
8821}
8822
/* Dispatch to the stringlib *_InsertThousandsGrouping routine matching
   the kind of `unicode`, forwarding n_buffer, digits, n_digits,
   min_width, grouping and the separator's data and length.

   unicode        target string, or NULL.  When NULL, the routine is
                  called with a NULL data pointer in 1-byte kind and
                  *maxchar is filled in afterwards.
   index          offset (in characters) into `unicode` at which the
                  data pointer handed to the routine starts.
   thousands_sep  unicode object providing the separator.
   maxchar        only written when `unicode` is NULL: set to 127, or to
                  at least the separator's maximum character value if
                  the returned length differs from n_digits.

   Returns the length reported by the stringlib routine, or -1 if a
   temporary conversion buffer could not be obtained. */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* byte offset of character `index`: the kind value is used as
           the per-character size here */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            /* widen the separator to the target's kind (temporary copy,
               freed after the switch below) */
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): here `data` is replaced by a temporary copy
               of the whole of `unicode` (the `index` offset is not
               re-applied), the switch below still dispatches on `kind`
               rather than thousands_sep_kind, and the copy is freed
               afterwards -- it looks as if output written in this case
               never reaches `unicode`.  Confirm whether callers can
               reach this branch. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        /* NOTE(review): this early return skips the PyMem_Free calls
           below; harmless only as long as the case stays unreachable. */
        assert(0);
        return -1;
    }
    /* free whichever temporary conversion buffer was created above */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        *maxchar = 127;
        /* a length other than n_digits is taken to mean separators are
           part of the output, so their maximum character counts too */
        if (len != n_digits) {
            *maxchar = Py_MAX(*maxchar,
                              PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}
8905
8906
/* Helper macro to fix up start/end slice values against a length:
   - end greater than len is clamped to len;
   - a negative end or start has len added to it and is then clamped
     to 0 if still negative.
   `start` and `end` must be modifiable lvalues; every argument is
   evaluated more than once, so pass side-effect-free expressions.
   Wrapped in do { } while (0) so that the macro behaves as a single
   statement (e.g. as the unbraced body of an if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008921
Alexander Belopolsky40018472011-02-26 01:02:56 +00008922Py_ssize_t
8923PyUnicode_Count(PyObject *str,
8924 PyObject *substr,
8925 Py_ssize_t start,
8926 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008928 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008929 PyObject* str_obj;
8930 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 int kind1, kind2, kind;
8932 void *buf1 = NULL, *buf2 = NULL;
8933 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008934
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008935 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008936 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008938 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008939 if (!sub_obj) {
8940 Py_DECREF(str_obj);
8941 return -1;
8942 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008943 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008944 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 Py_DECREF(str_obj);
8946 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 }
Tim Petersced69f82003-09-16 20:30:58 +00008948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 kind1 = PyUnicode_KIND(str_obj);
8950 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008951 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008954 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008955 if (kind2 > kind) {
8956 Py_DECREF(sub_obj);
8957 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008958 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008959 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008960 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 if (!buf2)
8963 goto onError;
8964 len1 = PyUnicode_GET_LENGTH(str_obj);
8965 len2 = PyUnicode_GET_LENGTH(sub_obj);
8966
8967 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008968 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008970 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8971 result = asciilib_count(
8972 ((Py_UCS1*)buf1) + start, end - start,
8973 buf2, len2, PY_SSIZE_T_MAX
8974 );
8975 else
8976 result = ucs1lib_count(
8977 ((Py_UCS1*)buf1) + start, end - start,
8978 buf2, len2, PY_SSIZE_T_MAX
8979 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 break;
8981 case PyUnicode_2BYTE_KIND:
8982 result = ucs2lib_count(
8983 ((Py_UCS2*)buf1) + start, end - start,
8984 buf2, len2, PY_SSIZE_T_MAX
8985 );
8986 break;
8987 case PyUnicode_4BYTE_KIND:
8988 result = ucs4lib_count(
8989 ((Py_UCS4*)buf1) + start, end - start,
8990 buf2, len2, PY_SSIZE_T_MAX
8991 );
8992 break;
8993 default:
8994 assert(0); result = 0;
8995 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008996
8997 Py_DECREF(sub_obj);
8998 Py_DECREF(str_obj);
8999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 if (kind2 != kind)
9001 PyMem_Free(buf2);
9002
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 onError:
9005 Py_DECREF(sub_obj);
9006 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 if (kind2 != kind && buf2)
9008 PyMem_Free(buf2);
9009 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010}
9011
Alexander Belopolsky40018472011-02-26 01:02:56 +00009012Py_ssize_t
9013PyUnicode_Find(PyObject *str,
9014 PyObject *sub,
9015 Py_ssize_t start,
9016 Py_ssize_t end,
9017 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009019 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009020
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009022 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009024 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009025 if (!sub) {
9026 Py_DECREF(str);
9027 return -2;
9028 }
9029 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9030 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 Py_DECREF(str);
9032 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 }
Tim Petersced69f82003-09-16 20:30:58 +00009034
Victor Stinner794d5672011-10-10 03:21:36 +02009035 result = any_find_slice(direction,
9036 str, sub, start, end
9037 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009038
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009040 Py_DECREF(sub);
9041
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 return result;
9043}
9044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045Py_ssize_t
9046PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9047 Py_ssize_t start, Py_ssize_t end,
9048 int direction)
9049{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009051 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (PyUnicode_READY(str) == -1)
9053 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009054 if (start < 0 || end < 0) {
9055 PyErr_SetString(PyExc_IndexError, "string index out of range");
9056 return -2;
9057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 if (end > PyUnicode_GET_LENGTH(str))
9059 end = PyUnicode_GET_LENGTH(str);
9060 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009061 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9062 kind, end-start, ch, direction);
9063 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009065 else
9066 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067}
9068
Alexander Belopolsky40018472011-02-26 01:02:56 +00009069static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009070tailmatch(PyObject *self,
9071 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009072 Py_ssize_t start,
9073 Py_ssize_t end,
9074 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 int kind_self;
9077 int kind_sub;
9078 void *data_self;
9079 void *data_sub;
9080 Py_ssize_t offset;
9081 Py_ssize_t i;
9082 Py_ssize_t end_sub;
9083
9084 if (PyUnicode_READY(self) == -1 ||
9085 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009086 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087
9088 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 return 1;
9090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9092 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 kind_self = PyUnicode_KIND(self);
9097 data_self = PyUnicode_DATA(self);
9098 kind_sub = PyUnicode_KIND(substring);
9099 data_sub = PyUnicode_DATA(substring);
9100 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9101
9102 if (direction > 0)
9103 offset = end;
9104 else
9105 offset = start;
9106
9107 if (PyUnicode_READ(kind_self, data_self, offset) ==
9108 PyUnicode_READ(kind_sub, data_sub, 0) &&
9109 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9110 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9111 /* If both are of the same kind, memcmp is sufficient */
9112 if (kind_self == kind_sub) {
9113 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009114 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 data_sub,
9116 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009117 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 }
9119 /* otherwise we have to compare each character by first accesing it */
9120 else {
9121 /* We do not need to compare 0 and len(substring)-1 because
9122 the if statement above ensured already that they are equal
9123 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 for (i = 1; i < end_sub; ++i) {
9125 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9126 PyUnicode_READ(kind_sub, data_sub, i))
9127 return 0;
9128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131 }
9132
9133 return 0;
9134}
9135
Alexander Belopolsky40018472011-02-26 01:02:56 +00009136Py_ssize_t
9137PyUnicode_Tailmatch(PyObject *str,
9138 PyObject *substr,
9139 Py_ssize_t start,
9140 Py_ssize_t end,
9141 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009143 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009144
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 str = PyUnicode_FromObject(str);
9146 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148 substr = PyUnicode_FromObject(substr);
9149 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 Py_DECREF(str);
9151 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 }
Tim Petersced69f82003-09-16 20:30:58 +00009153
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009154 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156 Py_DECREF(str);
9157 Py_DECREF(substr);
9158 return result;
9159}
9160
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161/* Apply fixfct filter to the Unicode object self and return a
9162 reference to the modified object */
9163
Alexander Belopolsky40018472011-02-26 01:02:56 +00009164static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009165fixup(PyObject *self,
9166 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 PyObject *u;
9169 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009170 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009172 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009175 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 /* fix functions return the new maximum character in a string,
9178 if the kind of the resulting unicode object does not change,
9179 everything is fine. Otherwise we need to change the string kind
9180 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009181 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009182
9183 if (maxchar_new == 0) {
9184 /* no changes */;
9185 if (PyUnicode_CheckExact(self)) {
9186 Py_DECREF(u);
9187 Py_INCREF(self);
9188 return self;
9189 }
9190 else
9191 return u;
9192 }
9193
Victor Stinnere6abb482012-05-02 01:15:40 +02009194 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195
Victor Stinnereaab6042011-12-11 22:22:39 +01009196 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009198
9199 /* In case the maximum character changed, we need to
9200 convert the string to the new category. */
9201 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9202 if (v == NULL) {
9203 Py_DECREF(u);
9204 return NULL;
9205 }
9206 if (maxchar_new > maxchar_old) {
9207 /* If the maxchar increased so that the kind changed, not all
9208 characters are representable anymore and we need to fix the
9209 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009210 _PyUnicode_FastCopyCharacters(v, 0,
9211 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009212 maxchar_old = fixfct(v);
9213 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 }
9215 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009216 _PyUnicode_FastCopyCharacters(v, 0,
9217 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009219 Py_DECREF(u);
9220 assert(_PyUnicode_CheckConsistency(v, 1));
9221 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222}
9223
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009224static PyObject *
9225ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009227 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9228 char *resdata, *data = PyUnicode_DATA(self);
9229 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009230
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009231 res = PyUnicode_New(len, 127);
9232 if (res == NULL)
9233 return NULL;
9234 resdata = PyUnicode_DATA(res);
9235 if (lower)
9236 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009238 _Py_bytes_upper(resdata, data, len);
9239 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240}
9241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009243handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009245 Py_ssize_t j;
9246 int final_sigma;
9247 Py_UCS4 c;
9248 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009249
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009250 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9251
9252 where ! is a negation and \p{xxx} is a character with property xxx.
9253 */
9254 for (j = i - 1; j >= 0; j--) {
9255 c = PyUnicode_READ(kind, data, j);
9256 if (!_PyUnicode_IsCaseIgnorable(c))
9257 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009259 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9260 if (final_sigma) {
9261 for (j = i + 1; j < length; j++) {
9262 c = PyUnicode_READ(kind, data, j);
9263 if (!_PyUnicode_IsCaseIgnorable(c))
9264 break;
9265 }
9266 final_sigma = j == length || !_PyUnicode_IsCased(c);
9267 }
9268 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269}
9270
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009271static int
9272lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9273 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009275 /* Obscure special case. */
9276 if (c == 0x3A3) {
9277 mapped[0] = handle_capital_sigma(kind, data, length, i);
9278 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009280 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281}
9282
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009283static Py_ssize_t
9284do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009286 Py_ssize_t i, k = 0;
9287 int n_res, j;
9288 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009289
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009290 c = PyUnicode_READ(kind, data, 0);
9291 n_res = _PyUnicode_ToUpperFull(c, mapped);
9292 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009293 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009294 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296 for (i = 1; i < length; i++) {
9297 c = PyUnicode_READ(kind, data, i);
9298 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9299 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009300 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009301 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009302 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009303 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009304 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305}
9306
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009307static Py_ssize_t
9308do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9309 Py_ssize_t i, k = 0;
9310
9311 for (i = 0; i < length; i++) {
9312 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9313 int n_res, j;
9314 if (Py_UNICODE_ISUPPER(c)) {
9315 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9316 }
9317 else if (Py_UNICODE_ISLOWER(c)) {
9318 n_res = _PyUnicode_ToUpperFull(c, mapped);
9319 }
9320 else {
9321 n_res = 1;
9322 mapped[0] = c;
9323 }
9324 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009325 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009326 res[k++] = mapped[j];
9327 }
9328 }
9329 return k;
9330}
9331
9332static Py_ssize_t
9333do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9334 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009336 Py_ssize_t i, k = 0;
9337
9338 for (i = 0; i < length; i++) {
9339 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9340 int n_res, j;
9341 if (lower)
9342 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9343 else
9344 n_res = _PyUnicode_ToUpperFull(c, mapped);
9345 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009346 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009347 res[k++] = mapped[j];
9348 }
9349 }
9350 return k;
9351}
9352
9353static Py_ssize_t
9354do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9355{
9356 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9357}
9358
9359static Py_ssize_t
9360do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9361{
9362 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9363}
9364
Benjamin Petersone51757f2012-01-12 21:10:29 -05009365static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009366do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9367{
9368 Py_ssize_t i, k = 0;
9369
9370 for (i = 0; i < length; i++) {
9371 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9372 Py_UCS4 mapped[3];
9373 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9374 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009375 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009376 res[k++] = mapped[j];
9377 }
9378 }
9379 return k;
9380}
9381
9382static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009383do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9384{
9385 Py_ssize_t i, k = 0;
9386 int previous_is_cased;
9387
9388 previous_is_cased = 0;
9389 for (i = 0; i < length; i++) {
9390 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9391 Py_UCS4 mapped[3];
9392 int n_res, j;
9393
9394 if (previous_is_cased)
9395 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9396 else
9397 n_res = _PyUnicode_ToTitleFull(c, mapped);
9398
9399 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009400 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009401 res[k++] = mapped[j];
9402 }
9403
9404 previous_is_cased = _PyUnicode_IsCased(c);
9405 }
9406 return k;
9407}
9408
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009409static PyObject *
9410case_operation(PyObject *self,
9411 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9412{
9413 PyObject *res = NULL;
9414 Py_ssize_t length, newlength = 0;
9415 int kind, outkind;
9416 void *data, *outdata;
9417 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9418
Benjamin Petersoneea48462012-01-16 14:28:50 -05009419 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009420
9421 kind = PyUnicode_KIND(self);
9422 data = PyUnicode_DATA(self);
9423 length = PyUnicode_GET_LENGTH(self);
9424 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9425 if (tmp == NULL)
9426 return PyErr_NoMemory();
9427 newlength = perform(kind, data, length, tmp, &maxchar);
9428 res = PyUnicode_New(newlength, maxchar);
9429 if (res == NULL)
9430 goto leave;
9431 tmpend = tmp + newlength;
9432 outdata = PyUnicode_DATA(res);
9433 outkind = PyUnicode_KIND(res);
9434 switch (outkind) {
9435 case PyUnicode_1BYTE_KIND:
9436 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9437 break;
9438 case PyUnicode_2BYTE_KIND:
9439 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9440 break;
9441 case PyUnicode_4BYTE_KIND:
9442 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9443 break;
9444 default:
9445 assert(0);
9446 break;
9447 }
9448 leave:
9449 PyMem_FREE(tmp);
9450 return res;
9451}
9452
Tim Peters8ce9f162004-08-27 01:49:32 +00009453PyObject *
9454PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009457 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009459 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009460 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9461 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009462 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009464 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009466 int use_memcpy;
9467 unsigned char *res_data = NULL, *sep_data = NULL;
9468 PyObject *last_obj;
9469 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470
Tim Peters05eba1f2004-08-27 21:32:02 +00009471 fseq = PySequence_Fast(seq, "");
9472 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009473 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009474 }
9475
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009476 /* NOTE: the following code can't call back into Python code,
9477 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009478 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009479
Tim Peters05eba1f2004-08-27 21:32:02 +00009480 seqlen = PySequence_Fast_GET_SIZE(fseq);
9481 /* If empty sequence, return u"". */
9482 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009483 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009484 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009485 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009486
Tim Peters05eba1f2004-08-27 21:32:02 +00009487 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009488 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009489 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009490 if (seqlen == 1) {
9491 if (PyUnicode_CheckExact(items[0])) {
9492 res = items[0];
9493 Py_INCREF(res);
9494 Py_DECREF(fseq);
9495 return res;
9496 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009497 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009498 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009499 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009500 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009501 /* Set up sep and seplen */
9502 if (separator == NULL) {
9503 /* fall back to a blank space separator */
9504 sep = PyUnicode_FromOrdinal(' ');
9505 if (!sep)
9506 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009507 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009508 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009509 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009510 else {
9511 if (!PyUnicode_Check(separator)) {
9512 PyErr_Format(PyExc_TypeError,
9513 "separator: expected str instance,"
9514 " %.80s found",
9515 Py_TYPE(separator)->tp_name);
9516 goto onError;
9517 }
9518 if (PyUnicode_READY(separator))
9519 goto onError;
9520 sep = separator;
9521 seplen = PyUnicode_GET_LENGTH(separator);
9522 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9523 /* inc refcount to keep this code path symmetric with the
9524 above case of a blank separator */
9525 Py_INCREF(sep);
9526 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009527 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009528 }
9529
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009530 /* There are at least two things to join, or else we have a subclass
9531 * of str in the sequence.
9532 * Do a pre-pass to figure out the total amount of space we'll
9533 * need (sz), and see whether all argument are strings.
9534 */
9535 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009536#ifdef Py_DEBUG
9537 use_memcpy = 0;
9538#else
9539 use_memcpy = 1;
9540#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009541 for (i = 0; i < seqlen; i++) {
9542 const Py_ssize_t old_sz = sz;
9543 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 if (!PyUnicode_Check(item)) {
9545 PyErr_Format(PyExc_TypeError,
9546 "sequence item %zd: expected str instance,"
9547 " %.80s found",
9548 i, Py_TYPE(item)->tp_name);
9549 goto onError;
9550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 if (PyUnicode_READY(item) == -1)
9552 goto onError;
9553 sz += PyUnicode_GET_LENGTH(item);
9554 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009555 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009556 if (i != 0)
9557 sz += seplen;
9558 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9559 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009560 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009561 goto onError;
9562 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009563 if (use_memcpy && last_obj != NULL) {
9564 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9565 use_memcpy = 0;
9566 }
9567 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009568 }
Tim Petersced69f82003-09-16 20:30:58 +00009569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009571 if (res == NULL)
9572 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009573
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009574 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009575#ifdef Py_DEBUG
9576 use_memcpy = 0;
9577#else
9578 if (use_memcpy) {
9579 res_data = PyUnicode_1BYTE_DATA(res);
9580 kind = PyUnicode_KIND(res);
9581 if (seplen != 0)
9582 sep_data = PyUnicode_1BYTE_DATA(sep);
9583 }
9584#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009585 if (use_memcpy) {
9586 for (i = 0; i < seqlen; ++i) {
9587 Py_ssize_t itemlen;
9588 item = items[i];
9589
9590 /* Copy item, and maybe the separator. */
9591 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009592 Py_MEMCPY(res_data,
9593 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009594 kind * seplen);
9595 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009596 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009597
9598 itemlen = PyUnicode_GET_LENGTH(item);
9599 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009600 Py_MEMCPY(res_data,
9601 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009602 kind * itemlen);
9603 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009604 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009605 }
9606 assert(res_data == PyUnicode_1BYTE_DATA(res)
9607 + kind * PyUnicode_GET_LENGTH(res));
9608 }
9609 else {
9610 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9611 Py_ssize_t itemlen;
9612 item = items[i];
9613
9614 /* Copy item, and maybe the separator. */
9615 if (i && seplen != 0) {
9616 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9617 res_offset += seplen;
9618 }
9619
9620 itemlen = PyUnicode_GET_LENGTH(item);
9621 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009622 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009623 res_offset += itemlen;
9624 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009625 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009626 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009627 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009628
Tim Peters05eba1f2004-08-27 21:32:02 +00009629 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009631 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633
Benjamin Peterson29060642009-01-31 22:14:21 +00009634 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009635 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009637 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 return NULL;
9639}
9640
/* FILL(kind, data, value, start, length)

   Store value into length consecutive characters of the buffer data,
   beginning at character index start.  kind selects the element width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   PyUnicode_WCHAR_KIND and any other kind trip an assertion.

   For the 1-byte kind the value is converted to unsigned char and written
   with memset().  Arguments may be evaluated more than once, so they must
   be free of side effects.  Every argument is parenthesized in the
   expansion, so expressions may be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9664
Victor Stinnerd3f08822012-05-29 12:57:52 +02009665void
9666_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9667 Py_UCS4 fill_char)
9668{
9669 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9670 const void *data = PyUnicode_DATA(unicode);
9671 assert(PyUnicode_IS_READY(unicode));
9672 assert(unicode_modifiable(unicode));
9673 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9674 assert(start >= 0);
9675 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9676 FILL(kind, data, fill_char, start, length);
9677}
9678
Victor Stinner3fe55312012-01-04 00:33:50 +01009679Py_ssize_t
9680PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9681 Py_UCS4 fill_char)
9682{
9683 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009684
9685 if (!PyUnicode_Check(unicode)) {
9686 PyErr_BadInternalCall();
9687 return -1;
9688 }
9689 if (PyUnicode_READY(unicode) == -1)
9690 return -1;
9691 if (unicode_check_modifiable(unicode))
9692 return -1;
9693
Victor Stinnerd3f08822012-05-29 12:57:52 +02009694 if (start < 0) {
9695 PyErr_SetString(PyExc_IndexError, "string index out of range");
9696 return -1;
9697 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009698 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9699 PyErr_SetString(PyExc_ValueError,
9700 "fill character is bigger than "
9701 "the string maximum character");
9702 return -1;
9703 }
9704
9705 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9706 length = Py_MIN(maxlen, length);
9707 if (length <= 0)
9708 return 0;
9709
Victor Stinnerd3f08822012-05-29 12:57:52 +02009710 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009711 return length;
9712}
9713
Victor Stinner9310abb2011-10-05 00:59:23 +02009714static PyObject *
9715pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009716 Py_ssize_t left,
9717 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 PyObject *u;
9721 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009722 int kind;
9723 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724
9725 if (left < 0)
9726 left = 0;
9727 if (right < 0)
9728 right = 0;
9729
Victor Stinnerc4b49542011-12-11 22:44:26 +01009730 if (left == 0 && right == 0)
9731 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9734 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009735 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9736 return NULL;
9737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009739 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009741 if (!u)
9742 return NULL;
9743
9744 kind = PyUnicode_KIND(u);
9745 data = PyUnicode_DATA(u);
9746 if (left)
9747 FILL(kind, data, fill, 0, left);
9748 if (right)
9749 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009750 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009751 assert(_PyUnicode_CheckConsistency(u, 1));
9752 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753}
9754
Alexander Belopolsky40018472011-02-26 01:02:56 +00009755PyObject *
9756PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759
9760 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009761 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009762 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009763 if (PyUnicode_READY(string) == -1) {
9764 Py_DECREF(string);
9765 return NULL;
9766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767
Benjamin Petersonead6b532011-12-20 17:23:42 -06009768 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009770 if (PyUnicode_IS_ASCII(string))
9771 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009772 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009773 PyUnicode_GET_LENGTH(string), keepends);
9774 else
9775 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009776 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009777 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 break;
9779 case PyUnicode_2BYTE_KIND:
9780 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009781 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 PyUnicode_GET_LENGTH(string), keepends);
9783 break;
9784 case PyUnicode_4BYTE_KIND:
9785 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009786 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 PyUnicode_GET_LENGTH(string), keepends);
9788 break;
9789 default:
9790 assert(0);
9791 list = 0;
9792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793 Py_DECREF(string);
9794 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795}
9796
Alexander Belopolsky40018472011-02-26 01:02:56 +00009797static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009798split(PyObject *self,
9799 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009800 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 int kind1, kind2, kind;
9803 void *buf1, *buf2;
9804 Py_ssize_t len1, len2;
9805 PyObject* out;
9806
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009808 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 if (PyUnicode_READY(self) == -1)
9811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009814 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009816 if (PyUnicode_IS_ASCII(self))
9817 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009818 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009819 PyUnicode_GET_LENGTH(self), maxcount
9820 );
9821 else
9822 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009823 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009824 PyUnicode_GET_LENGTH(self), maxcount
9825 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 case PyUnicode_2BYTE_KIND:
9827 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009828 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 PyUnicode_GET_LENGTH(self), maxcount
9830 );
9831 case PyUnicode_4BYTE_KIND:
9832 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 PyUnicode_GET_LENGTH(self), maxcount
9835 );
9836 default:
9837 assert(0);
9838 return NULL;
9839 }
9840
9841 if (PyUnicode_READY(substring) == -1)
9842 return NULL;
9843
9844 kind1 = PyUnicode_KIND(self);
9845 kind2 = PyUnicode_KIND(substring);
9846 kind = kind1 > kind2 ? kind1 : kind2;
9847 buf1 = PyUnicode_DATA(self);
9848 buf2 = PyUnicode_DATA(substring);
9849 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009850 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 if (!buf1)
9852 return NULL;
9853 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009854 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 if (!buf2) {
9856 if (kind1 != kind) PyMem_Free(buf1);
9857 return NULL;
9858 }
9859 len1 = PyUnicode_GET_LENGTH(self);
9860 len2 = PyUnicode_GET_LENGTH(substring);
9861
Benjamin Petersonead6b532011-12-20 17:23:42 -06009862 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009864 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9865 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009866 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009867 else
9868 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009869 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 break;
9871 case PyUnicode_2BYTE_KIND:
9872 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009873 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 break;
9875 case PyUnicode_4BYTE_KIND:
9876 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009877 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 break;
9879 default:
9880 out = NULL;
9881 }
9882 if (kind1 != kind)
9883 PyMem_Free(buf1);
9884 if (kind2 != kind)
9885 PyMem_Free(buf2);
9886 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887}
9888
Alexander Belopolsky40018472011-02-26 01:02:56 +00009889static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009890rsplit(PyObject *self,
9891 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009892 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 int kind1, kind2, kind;
9895 void *buf1, *buf2;
9896 Py_ssize_t len1, len2;
9897 PyObject* out;
9898
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009899 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009900 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 if (PyUnicode_READY(self) == -1)
9903 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009906 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009908 if (PyUnicode_IS_ASCII(self))
9909 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009910 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009911 PyUnicode_GET_LENGTH(self), maxcount
9912 );
9913 else
9914 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009915 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009916 PyUnicode_GET_LENGTH(self), maxcount
9917 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 case PyUnicode_2BYTE_KIND:
9919 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009920 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 PyUnicode_GET_LENGTH(self), maxcount
9922 );
9923 case PyUnicode_4BYTE_KIND:
9924 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 PyUnicode_GET_LENGTH(self), maxcount
9927 );
9928 default:
9929 assert(0);
9930 return NULL;
9931 }
9932
9933 if (PyUnicode_READY(substring) == -1)
9934 return NULL;
9935
9936 kind1 = PyUnicode_KIND(self);
9937 kind2 = PyUnicode_KIND(substring);
9938 kind = kind1 > kind2 ? kind1 : kind2;
9939 buf1 = PyUnicode_DATA(self);
9940 buf2 = PyUnicode_DATA(substring);
9941 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009942 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 if (!buf1)
9944 return NULL;
9945 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009946 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 if (!buf2) {
9948 if (kind1 != kind) PyMem_Free(buf1);
9949 return NULL;
9950 }
9951 len1 = PyUnicode_GET_LENGTH(self);
9952 len2 = PyUnicode_GET_LENGTH(substring);
9953
Benjamin Petersonead6b532011-12-20 17:23:42 -06009954 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009956 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9957 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009958 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009959 else
9960 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009961 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 break;
9963 case PyUnicode_2BYTE_KIND:
9964 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009965 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 break;
9967 case PyUnicode_4BYTE_KIND:
9968 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009969 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 break;
9971 default:
9972 out = NULL;
9973 }
9974 if (kind1 != kind)
9975 PyMem_Free(buf1);
9976 if (kind2 != kind)
9977 PyMem_Free(buf2);
9978 return out;
9979}
9980
9981static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9983 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009985 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009987 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9988 return asciilib_find(buf1, len1, buf2, len2, offset);
9989 else
9990 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 case PyUnicode_2BYTE_KIND:
9992 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9993 case PyUnicode_4BYTE_KIND:
9994 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9995 }
9996 assert(0);
9997 return -1;
9998}
9999
10000static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010001anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10002 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010004 switch (kind) {
10005 case PyUnicode_1BYTE_KIND:
10006 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10007 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10008 else
10009 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10010 case PyUnicode_2BYTE_KIND:
10011 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10012 case PyUnicode_4BYTE_KIND:
10013 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10014 }
10015 assert(0);
10016 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010017}
10018
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010019static void
10020replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10021 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10022{
10023 int kind = PyUnicode_KIND(u);
10024 void *data = PyUnicode_DATA(u);
10025 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10026 if (kind == PyUnicode_1BYTE_KIND) {
10027 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10028 (Py_UCS1 *)data + len,
10029 u1, u2, maxcount);
10030 }
10031 else if (kind == PyUnicode_2BYTE_KIND) {
10032 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10033 (Py_UCS2 *)data + len,
10034 u1, u2, maxcount);
10035 }
10036 else {
10037 assert(kind == PyUnicode_4BYTE_KIND);
10038 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10039 (Py_UCS4 *)data + len,
10040 u1, u2, maxcount);
10041 }
10042}
10043
Alexander Belopolsky40018472011-02-26 01:02:56 +000010044static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045replace(PyObject *self, PyObject *str1,
10046 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 PyObject *u;
10049 char *sbuf = PyUnicode_DATA(self);
10050 char *buf1 = PyUnicode_DATA(str1);
10051 char *buf2 = PyUnicode_DATA(str2);
10052 int srelease = 0, release1 = 0, release2 = 0;
10053 int skind = PyUnicode_KIND(self);
10054 int kind1 = PyUnicode_KIND(str1);
10055 int kind2 = PyUnicode_KIND(str2);
10056 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10057 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10058 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010059 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010060 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061
10062 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010063 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010065 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066
Victor Stinner59de0ee2011-10-07 10:01:28 +020010067 if (str1 == str2)
10068 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069
Victor Stinner49a0a212011-10-12 23:46:10 +020010070 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010071 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10072 if (maxchar < maxchar_str1)
10073 /* substring too wide to be present */
10074 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010075 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10076 /* Replacing str1 with str2 may cause a maxchar reduction in the
10077 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010078 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010079 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010082 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010084 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010086 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010087 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010088 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010089
Victor Stinner69ed0f42013-04-09 21:48:24 +020010090 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010091 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010092 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010093 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010094 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010096 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010098
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010099 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10100 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010101 }
10102 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 int rkind = skind;
10104 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010105 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 if (kind1 < rkind) {
10108 /* widen substring */
10109 buf1 = _PyUnicode_AsKind(str1, rkind);
10110 if (!buf1) goto error;
10111 release1 = 1;
10112 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010113 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010114 if (i < 0)
10115 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (rkind > kind2) {
10117 /* widen replacement */
10118 buf2 = _PyUnicode_AsKind(str2, rkind);
10119 if (!buf2) goto error;
10120 release2 = 1;
10121 }
10122 else if (rkind < kind2) {
10123 /* widen self and buf1 */
10124 rkind = kind2;
10125 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010126 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 sbuf = _PyUnicode_AsKind(self, rkind);
10128 if (!sbuf) goto error;
10129 srelease = 1;
10130 buf1 = _PyUnicode_AsKind(str1, rkind);
10131 if (!buf1) goto error;
10132 release1 = 1;
10133 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010134 u = PyUnicode_New(slen, maxchar);
10135 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010137 assert(PyUnicode_KIND(u) == rkind);
10138 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010139
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010140 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010141 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010142 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010144 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010146
10147 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010148 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010149 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010151 if (i == -1)
10152 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010153 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010155 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 }
10160 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010162 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 int rkind = skind;
10164 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010167 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 buf1 = _PyUnicode_AsKind(str1, rkind);
10169 if (!buf1) goto error;
10170 release1 = 1;
10171 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010172 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010173 if (n == 0)
10174 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010176 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 buf2 = _PyUnicode_AsKind(str2, rkind);
10178 if (!buf2) goto error;
10179 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010182 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 rkind = kind2;
10184 sbuf = _PyUnicode_AsKind(self, rkind);
10185 if (!sbuf) goto error;
10186 srelease = 1;
10187 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010188 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 buf1 = _PyUnicode_AsKind(str1, rkind);
10190 if (!buf1) goto error;
10191 release1 = 1;
10192 }
10193 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10194 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010195 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 PyErr_SetString(PyExc_OverflowError,
10197 "replace string is too long");
10198 goto error;
10199 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010200 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010201 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010202 _Py_INCREF_UNICODE_EMPTY();
10203 if (!unicode_empty)
10204 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010205 u = unicode_empty;
10206 goto done;
10207 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010208 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 PyErr_SetString(PyExc_OverflowError,
10210 "replace string is too long");
10211 goto error;
10212 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010213 u = PyUnicode_New(new_size, maxchar);
10214 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010216 assert(PyUnicode_KIND(u) == rkind);
10217 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 ires = i = 0;
10219 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220 while (n-- > 0) {
10221 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010222 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010223 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010225 if (j == -1)
10226 break;
10227 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010228 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010229 memcpy(res + rkind * ires,
10230 sbuf + rkind * i,
10231 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010233 }
10234 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010236 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010238 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010245 memcpy(res + rkind * ires,
10246 sbuf + rkind * i,
10247 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010248 }
10249 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010250 /* interleave */
10251 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010252 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010254 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010256 if (--n <= 0)
10257 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010258 memcpy(res + rkind * ires,
10259 sbuf + rkind * i,
10260 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 ires++;
10262 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010263 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010264 memcpy(res + rkind * ires,
10265 sbuf + rkind * i,
10266 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010267 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010268 }
10269
10270 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010271 unicode_adjust_maxchar(&u);
10272 if (u == NULL)
10273 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010275
10276 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (srelease)
10278 PyMem_FREE(sbuf);
10279 if (release1)
10280 PyMem_FREE(buf1);
10281 if (release2)
10282 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010283 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010285
Benjamin Peterson29060642009-01-31 22:14:21 +000010286 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010287 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 if (srelease)
10289 PyMem_FREE(sbuf);
10290 if (release1)
10291 PyMem_FREE(buf1);
10292 if (release2)
10293 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010294 return unicode_result_unchanged(self);
10295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 error:
10297 if (srelease && sbuf)
10298 PyMem_FREE(sbuf);
10299 if (release1 && buf1)
10300 PyMem_FREE(buf1);
10301 if (release2 && buf2)
10302 PyMem_FREE(buf2);
10303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304}
10305
10306/* --- Unicode Object Methods --------------------------------------------- */
10307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010308PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010309 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310\n\
10311Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010312characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
10314static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010315unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010317 if (PyUnicode_READY(self) == -1)
10318 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010319 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320}
10321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010322PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010323 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324\n\
10325Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010326have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327
10328static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010329unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010331 if (PyUnicode_READY(self) == -1)
10332 return NULL;
10333 if (PyUnicode_GET_LENGTH(self) == 0)
10334 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010335 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336}
10337
Benjamin Petersond5890c82012-01-14 13:23:30 -050010338PyDoc_STRVAR(casefold__doc__,
10339 "S.casefold() -> str\n\
10340\n\
10341Return a version of S suitable for caseless comparisons.");
10342
10343static PyObject *
10344unicode_casefold(PyObject *self)
10345{
10346 if (PyUnicode_READY(self) == -1)
10347 return NULL;
10348 if (PyUnicode_IS_ASCII(self))
10349 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010350 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010351}
10352
10353
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010354/* Argument converter. Coerces to a single unicode character */
10355
10356static int
10357convert_uc(PyObject *obj, void *addr)
10358{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010360 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010361
Benjamin Peterson14339b62009-01-31 16:36:08 +000010362 uniobj = PyUnicode_FromObject(obj);
10363 if (uniobj == NULL) {
10364 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010365 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010366 return 0;
10367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010369 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010370 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010371 Py_DECREF(uniobj);
10372 return 0;
10373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010375 Py_DECREF(uniobj);
10376 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010377}
10378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010379PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010380 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010382Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010383done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384
10385static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010386unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010388 Py_ssize_t marg, left;
10389 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 Py_UCS4 fillchar = ' ';
10391
Victor Stinnere9a29352011-10-01 02:14:59 +020010392 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394
Benjamin Petersonbac79492012-01-14 13:34:47 -050010395 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396 return NULL;
10397
Victor Stinnerc4b49542011-12-11 22:44:26 +010010398 if (PyUnicode_GET_LENGTH(self) >= width)
10399 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400
Victor Stinnerc4b49542011-12-11 22:44:26 +010010401 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402 left = marg / 2 + (marg & width & 1);
10403
Victor Stinner9310abb2011-10-05 00:59:23 +020010404 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405}
10406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407/* This function assumes that str1 and str2 are readied by the caller. */
10408
Marc-André Lemburge5034372000-08-08 08:04:29 +000010409static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010410unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010411{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010412#define COMPARE(TYPE1, TYPE2) \
10413 do { \
10414 TYPE1* p1 = (TYPE1 *)data1; \
10415 TYPE2* p2 = (TYPE2 *)data2; \
10416 TYPE1* end = p1 + len; \
10417 Py_UCS4 c1, c2; \
10418 for (; p1 != end; p1++, p2++) { \
10419 c1 = *p1; \
10420 c2 = *p2; \
10421 if (c1 != c2) \
10422 return (c1 < c2) ? -1 : 1; \
10423 } \
10424 } \
10425 while (0)
10426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 int kind1, kind2;
10428 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010429 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 kind1 = PyUnicode_KIND(str1);
10432 kind2 = PyUnicode_KIND(str2);
10433 data1 = PyUnicode_DATA(str1);
10434 data2 = PyUnicode_DATA(str2);
10435 len1 = PyUnicode_GET_LENGTH(str1);
10436 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010437 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010438
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010439 switch(kind1) {
10440 case PyUnicode_1BYTE_KIND:
10441 {
10442 switch(kind2) {
10443 case PyUnicode_1BYTE_KIND:
10444 {
10445 int cmp = memcmp(data1, data2, len);
10446 /* normalize result of memcmp() into the range [-1; 1] */
10447 if (cmp < 0)
10448 return -1;
10449 if (cmp > 0)
10450 return 1;
10451 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010452 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010453 case PyUnicode_2BYTE_KIND:
10454 COMPARE(Py_UCS1, Py_UCS2);
10455 break;
10456 case PyUnicode_4BYTE_KIND:
10457 COMPARE(Py_UCS1, Py_UCS4);
10458 break;
10459 default:
10460 assert(0);
10461 }
10462 break;
10463 }
10464 case PyUnicode_2BYTE_KIND:
10465 {
10466 switch(kind2) {
10467 case PyUnicode_1BYTE_KIND:
10468 COMPARE(Py_UCS2, Py_UCS1);
10469 break;
10470 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010471 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010472 COMPARE(Py_UCS2, Py_UCS2);
10473 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010474 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010475 case PyUnicode_4BYTE_KIND:
10476 COMPARE(Py_UCS2, Py_UCS4);
10477 break;
10478 default:
10479 assert(0);
10480 }
10481 break;
10482 }
10483 case PyUnicode_4BYTE_KIND:
10484 {
10485 switch(kind2) {
10486 case PyUnicode_1BYTE_KIND:
10487 COMPARE(Py_UCS4, Py_UCS1);
10488 break;
10489 case PyUnicode_2BYTE_KIND:
10490 COMPARE(Py_UCS4, Py_UCS2);
10491 break;
10492 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010493 {
10494#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10495 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10496 /* normalize result of wmemcmp() into the range [-1; 1] */
10497 if (cmp < 0)
10498 return -1;
10499 if (cmp > 0)
10500 return 1;
10501#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010502 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010503#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010504 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010505 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010506 default:
10507 assert(0);
10508 }
10509 break;
10510 }
10511 default:
10512 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010513 }
10514
Victor Stinner770e19e2012-10-04 22:59:45 +020010515 if (len1 == len2)
10516 return 0;
10517 if (len1 < len2)
10518 return -1;
10519 else
10520 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010521
10522#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010523}
10524
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010525Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010526unicode_compare_eq(PyObject *str1, PyObject *str2)
10527{
10528 int kind;
10529 void *data1, *data2;
10530 Py_ssize_t len;
10531 int cmp;
10532
Victor Stinnere5567ad2012-10-23 02:48:49 +020010533 len = PyUnicode_GET_LENGTH(str1);
10534 if (PyUnicode_GET_LENGTH(str2) != len)
10535 return 0;
10536 kind = PyUnicode_KIND(str1);
10537 if (PyUnicode_KIND(str2) != kind)
10538 return 0;
10539 data1 = PyUnicode_DATA(str1);
10540 data2 = PyUnicode_DATA(str2);
10541
10542 cmp = memcmp(data1, data2, len * kind);
10543 return (cmp == 0);
10544}
10545
10546
Alexander Belopolsky40018472011-02-26 01:02:56 +000010547int
10548PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10551 if (PyUnicode_READY(left) == -1 ||
10552 PyUnicode_READY(right) == -1)
10553 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010554
10555 /* a string is equal to itself */
10556 if (left == right)
10557 return 0;
10558
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010559 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010561 PyErr_Format(PyExc_TypeError,
10562 "Can't compare %.100s and %.100s",
10563 left->ob_type->tp_name,
10564 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565 return -1;
10566}
10567
Martin v. Löwis5b222132007-06-10 09:51:05 +000010568int
10569PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 Py_ssize_t i;
10572 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 Py_UCS4 chr;
10574
Victor Stinner910337b2011-10-03 03:20:16 +020010575 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (PyUnicode_READY(uni) == -1)
10577 return -1;
10578 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010579 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010580 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010581 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010582 size_t len, len2 = strlen(str);
10583 int cmp;
10584
10585 len = Py_MIN(len1, len2);
10586 cmp = memcmp(data, str, len);
10587 if (cmp != 0)
10588 return cmp;
10589 if (len1 > len2)
10590 return 1; /* uni is longer */
10591 if (len2 > len1)
10592 return -1; /* str is longer */
10593 return 0;
10594 }
10595 else {
10596 void *data = PyUnicode_DATA(uni);
10597 /* Compare Unicode string and source character set string */
10598 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10599 if (chr != str[i])
10600 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10601 /* This check keeps Python strings that end in '\0' from comparing equal
10602 to C strings identical up to that point. */
10603 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10604 return 1; /* uni is longer */
10605 if (str[i])
10606 return -1; /* str is longer */
10607 return 0;
10608 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010609}
10610
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010611
Benjamin Peterson29060642009-01-31 22:14:21 +000010612#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010613 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010614
Alexander Belopolsky40018472011-02-26 01:02:56 +000010615PyObject *
10616PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010617{
10618 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010619 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010620
Victor Stinnere5567ad2012-10-23 02:48:49 +020010621 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10622 Py_RETURN_NOTIMPLEMENTED;
10623
10624 if (PyUnicode_READY(left) == -1 ||
10625 PyUnicode_READY(right) == -1)
10626 return NULL;
10627
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010628 if (left == right) {
10629 switch (op) {
10630 case Py_EQ:
10631 case Py_LE:
10632 case Py_GE:
10633 /* a string is equal to itself */
10634 v = Py_True;
10635 break;
10636 case Py_NE:
10637 case Py_LT:
10638 case Py_GT:
10639 v = Py_False;
10640 break;
10641 default:
10642 PyErr_BadArgument();
10643 return NULL;
10644 }
10645 }
10646 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010647 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010648 result ^= (op == Py_NE);
10649 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010650 }
10651 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010652 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010653
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010654 /* Convert the return value to a Boolean */
10655 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010656 case Py_LE:
10657 v = TEST_COND(result <= 0);
10658 break;
10659 case Py_GE:
10660 v = TEST_COND(result >= 0);
10661 break;
10662 case Py_LT:
10663 v = TEST_COND(result == -1);
10664 break;
10665 case Py_GT:
10666 v = TEST_COND(result == 1);
10667 break;
10668 default:
10669 PyErr_BadArgument();
10670 return NULL;
10671 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010672 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010673 Py_INCREF(v);
10674 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010675}
10676
Alexander Belopolsky40018472011-02-26 01:02:56 +000010677int
10678PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010679{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010681 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 void *buf1, *buf2;
10683 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010684 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010685
10686 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687 sub = PyUnicode_FromObject(element);
10688 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 PyErr_Format(PyExc_TypeError,
10690 "'in <string>' requires string as left operand, not %s",
10691 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010692 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010693 }
10694
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010696 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697 Py_DECREF(sub);
10698 return -1;
10699 }
10700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 kind1 = PyUnicode_KIND(str);
10702 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 buf1 = PyUnicode_DATA(str);
10704 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010705 if (kind2 != kind1) {
10706 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010707 Py_DECREF(sub);
10708 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010709 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010710 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010711 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 if (!buf2) {
10714 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010715 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 return -1;
10717 }
10718 len1 = PyUnicode_GET_LENGTH(str);
10719 len2 = PyUnicode_GET_LENGTH(sub);
10720
Victor Stinner77282cb2013-04-14 19:22:47 +020010721 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 case PyUnicode_1BYTE_KIND:
10723 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10724 break;
10725 case PyUnicode_2BYTE_KIND:
10726 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10727 break;
10728 case PyUnicode_4BYTE_KIND:
10729 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10730 break;
10731 default:
10732 result = -1;
10733 assert(0);
10734 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010735
10736 Py_DECREF(str);
10737 Py_DECREF(sub);
10738
Victor Stinner77282cb2013-04-14 19:22:47 +020010739 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 PyMem_Free(buf2);
10741
Guido van Rossum403d68b2000-03-13 15:55:09 +000010742 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010743}
10744
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745/* Concat to string or Unicode object giving a new Unicode object. */
10746
Alexander Belopolsky40018472011-02-26 01:02:56 +000010747PyObject *
10748PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010751 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010752 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
10754 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010757 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761
10762 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010763 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010767 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010768 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770 }
10771
Victor Stinner488fa492011-12-12 00:01:39 +010010772 u_len = PyUnicode_GET_LENGTH(u);
10773 v_len = PyUnicode_GET_LENGTH(v);
10774 if (u_len > PY_SSIZE_T_MAX - v_len) {
10775 PyErr_SetString(PyExc_OverflowError,
10776 "strings are too large to concat");
10777 goto onError;
10778 }
10779 new_len = u_len + v_len;
10780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010782 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010783 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010786 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010789 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10790 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791 Py_DECREF(u);
10792 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010793 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797 Py_XDECREF(u);
10798 Py_XDECREF(v);
10799 return NULL;
10800}
10801
Walter Dörwald1ab83302007-05-18 17:15:44 +000010802void
Victor Stinner23e56682011-10-03 03:54:37 +020010803PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010804{
Victor Stinner23e56682011-10-03 03:54:37 +020010805 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010806 Py_UCS4 maxchar, maxchar2;
10807 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010808
10809 if (p_left == NULL) {
10810 if (!PyErr_Occurred())
10811 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010812 return;
10813 }
Victor Stinner23e56682011-10-03 03:54:37 +020010814 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020010815 if (right == NULL || left == NULL
10816 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010817 if (!PyErr_Occurred())
10818 PyErr_BadInternalCall();
10819 goto error;
10820 }
10821
Benjamin Petersonbac79492012-01-14 13:34:47 -050010822 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010823 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010824 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010825 goto error;
10826
Victor Stinner488fa492011-12-12 00:01:39 +010010827 /* Shortcuts */
10828 if (left == unicode_empty) {
10829 Py_DECREF(left);
10830 Py_INCREF(right);
10831 *p_left = right;
10832 return;
10833 }
10834 if (right == unicode_empty)
10835 return;
10836
10837 left_len = PyUnicode_GET_LENGTH(left);
10838 right_len = PyUnicode_GET_LENGTH(right);
10839 if (left_len > PY_SSIZE_T_MAX - right_len) {
10840 PyErr_SetString(PyExc_OverflowError,
10841 "strings are too large to concat");
10842 goto error;
10843 }
10844 new_len = left_len + right_len;
10845
10846 if (unicode_modifiable(left)
10847 && PyUnicode_CheckExact(right)
10848 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010849 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10850 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010851 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010852 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010853 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10854 {
10855 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010856 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010010857 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020010858
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010859 /* copy 'right' into the newly allocated area of 'left' */
10860 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010861 }
Victor Stinner488fa492011-12-12 00:01:39 +010010862 else {
10863 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10864 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010865 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010866
Victor Stinner488fa492011-12-12 00:01:39 +010010867 /* Concat the two Unicode strings */
10868 res = PyUnicode_New(new_len, maxchar);
10869 if (res == NULL)
10870 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010871 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10872 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010873 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010874 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010010875 }
10876 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010877 return;
10878
10879error:
Victor Stinner488fa492011-12-12 00:01:39 +010010880 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010881}
10882
10883void
10884PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10885{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886 PyUnicode_Append(pleft, right);
10887 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010888}
10889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010890PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010891 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010893Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010894string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010895interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896
10897static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010898unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010900 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010901 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010902 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 int kind1, kind2, kind;
10905 void *buf1, *buf2;
10906 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907
Jesus Ceaac451502011-04-20 17:09:23 +020010908 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10909 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010910 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 kind1 = PyUnicode_KIND(self);
10913 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020010914 if (kind2 > kind1) {
10915 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010916 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020010917 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010918 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 buf1 = PyUnicode_DATA(self);
10920 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010922 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 if (!buf2) {
10924 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 return NULL;
10926 }
10927 len1 = PyUnicode_GET_LENGTH(self);
10928 len2 = PyUnicode_GET_LENGTH(substring);
10929
10930 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010931 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 case PyUnicode_1BYTE_KIND:
10933 iresult = ucs1lib_count(
10934 ((Py_UCS1*)buf1) + start, end - start,
10935 buf2, len2, PY_SSIZE_T_MAX
10936 );
10937 break;
10938 case PyUnicode_2BYTE_KIND:
10939 iresult = ucs2lib_count(
10940 ((Py_UCS2*)buf1) + start, end - start,
10941 buf2, len2, PY_SSIZE_T_MAX
10942 );
10943 break;
10944 case PyUnicode_4BYTE_KIND:
10945 iresult = ucs4lib_count(
10946 ((Py_UCS4*)buf1) + start, end - start,
10947 buf2, len2, PY_SSIZE_T_MAX
10948 );
10949 break;
10950 default:
10951 assert(0); iresult = 0;
10952 }
10953
10954 result = PyLong_FromSsize_t(iresult);
10955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 if (kind2 != kind)
10957 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010960
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961 return result;
10962}
10963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010964PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010965 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010967Encode S using the codec registered for encoding. Default encoding\n\
10968is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010969handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010970a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10971'xmlcharrefreplace' as well as any other name registered with\n\
10972codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
10974static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010975unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010977 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978 char *encoding = NULL;
10979 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010980
Benjamin Peterson308d6372009-09-18 21:42:35 +000010981 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10982 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010984 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010985}
10986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010987PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989\n\
10990Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010991If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992
10993static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010994unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010996 Py_ssize_t i, j, line_pos, src_len, incr;
10997 Py_UCS4 ch;
10998 PyObject *u;
10999 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011001 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011002 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
11004 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006
Antoine Pitrou22425222011-10-04 19:10:51 +020011007 if (PyUnicode_READY(self) == -1)
11008 return NULL;
11009
Thomas Wouters7e474022000-07-16 12:04:32 +000011010 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011011 src_len = PyUnicode_GET_LENGTH(self);
11012 i = j = line_pos = 0;
11013 kind = PyUnicode_KIND(self);
11014 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011015 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011016 for (; i < src_len; i++) {
11017 ch = PyUnicode_READ(kind, src_data, i);
11018 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011019 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011021 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011023 goto overflow;
11024 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011025 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011026 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011029 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011030 goto overflow;
11031 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011033 if (ch == '\n' || ch == '\r')
11034 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011036 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011037 if (!found)
11038 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011039
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011041 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 if (!u)
11043 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011044 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
Antoine Pitroue71d5742011-10-04 15:55:09 +020011046 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047
Antoine Pitroue71d5742011-10-04 15:55:09 +020011048 for (; i < src_len; i++) {
11049 ch = PyUnicode_READ(kind, src_data, i);
11050 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011051 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011052 incr = tabsize - (line_pos % tabsize);
11053 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011054 FILL(kind, dest_data, ' ', j, incr);
11055 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011057 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011058 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011059 line_pos++;
11060 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011061 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011062 if (ch == '\n' || ch == '\r')
11063 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011065 }
11066 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011067 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011068
Antoine Pitroue71d5742011-10-04 15:55:09 +020011069 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011070 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11071 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072}
11073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011074PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011075 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076\n\
11077Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011078such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079arguments start and end are interpreted as in slice notation.\n\
11080\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011081Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082
11083static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011086 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011087 Py_ssize_t start;
11088 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011089 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090
Jesus Ceaac451502011-04-20 17:09:23 +020011091 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11092 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094
Christian Heimesd47802e2013-06-29 21:33:36 +020011095 if (PyUnicode_READY(self) == -1) {
11096 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011098 }
11099 if (PyUnicode_READY(substring) == -1) {
11100 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103
Victor Stinner7931d9a2011-11-04 00:22:48 +010011104 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105
11106 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011108 if (result == -2)
11109 return NULL;
11110
Christian Heimes217cfd12007-12-02 14:31:20 +000011111 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112}
11113
11114static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011115unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011117 void *data;
11118 enum PyUnicode_Kind kind;
11119 Py_UCS4 ch;
11120 PyObject *res;
11121
11122 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11123 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011125 }
11126 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11127 PyErr_SetString(PyExc_IndexError, "string index out of range");
11128 return NULL;
11129 }
11130 kind = PyUnicode_KIND(self);
11131 data = PyUnicode_DATA(self);
11132 ch = PyUnicode_READ(kind, data, index);
11133 if (ch < 256)
11134 return get_latin1_char(ch);
11135
11136 res = PyUnicode_New(1, ch);
11137 if (res == NULL)
11138 return NULL;
11139 kind = PyUnicode_KIND(res);
11140 data = PyUnicode_DATA(res);
11141 PyUnicode_WRITE(kind, data, 0, ch);
11142 assert(_PyUnicode_CheckConsistency(res, 1));
11143 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144}
11145
Guido van Rossumc2504932007-09-18 19:42:40 +000011146/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011147 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011148static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011149unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150{
Guido van Rossumc2504932007-09-18 19:42:40 +000011151 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011152 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011153
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011154#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011155 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011156#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (_PyUnicode_HASH(self) != -1)
11158 return _PyUnicode_HASH(self);
11159 if (PyUnicode_READY(self) == -1)
11160 return -1;
11161 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011162 /*
11163 We make the hash of the empty string be 0, rather than using
11164 (prefix ^ suffix), since this slightly obfuscates the hash secret
11165 */
11166 if (len == 0) {
11167 _PyUnicode_HASH(self) = 0;
11168 return 0;
11169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170
11171 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011172#define HASH(P) \
11173 x ^= (Py_uhash_t) *P << 7; \
11174 while (--len >= 0) \
11175 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176
Georg Brandl2fb477c2012-02-21 00:33:36 +010011177 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 switch (PyUnicode_KIND(self)) {
11179 case PyUnicode_1BYTE_KIND: {
11180 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11181 HASH(c);
11182 break;
11183 }
11184 case PyUnicode_2BYTE_KIND: {
11185 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11186 HASH(s);
11187 break;
11188 }
11189 default: {
11190 Py_UCS4 *l;
11191 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11192 "Impossible switch case in unicode_hash");
11193 l = PyUnicode_4BYTE_DATA(self);
11194 HASH(l);
11195 break;
11196 }
11197 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011198 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11199 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200
Guido van Rossumc2504932007-09-18 19:42:40 +000011201 if (x == -1)
11202 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011204 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011208PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011209 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011211Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
11213static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011216 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011217 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011218 Py_ssize_t start;
11219 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220
Jesus Ceaac451502011-04-20 17:09:23 +020011221 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11222 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224
Christian Heimesd47a0452013-06-29 21:21:37 +020011225 if (PyUnicode_READY(self) == -1) {
11226 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011228 }
11229 if (PyUnicode_READY(substring) == -1) {
11230 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233
Victor Stinner7931d9a2011-11-04 00:22:48 +010011234 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
11236 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 if (result == -2)
11239 return NULL;
11240
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 if (result < 0) {
11242 PyErr_SetString(PyExc_ValueError, "substring not found");
11243 return NULL;
11244 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011245
Christian Heimes217cfd12007-12-02 14:31:20 +000011246 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247}
11248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011249PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011250 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011252Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011253at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
11255static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011256unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 Py_ssize_t i, length;
11259 int kind;
11260 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261 int cased;
11262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 if (PyUnicode_READY(self) == -1)
11264 return NULL;
11265 length = PyUnicode_GET_LENGTH(self);
11266 kind = PyUnicode_KIND(self);
11267 data = PyUnicode_DATA(self);
11268
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 if (length == 1)
11271 return PyBool_FromLong(
11272 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011274 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011276 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011277
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011279 for (i = 0; i < length; i++) {
11280 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011281
Benjamin Peterson29060642009-01-31 22:14:21 +000011282 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11283 return PyBool_FromLong(0);
11284 else if (!cased && Py_UNICODE_ISLOWER(ch))
11285 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011287 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288}
11289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011290PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011293Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011294at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
11296static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011297unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011299 Py_ssize_t i, length;
11300 int kind;
11301 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302 int cased;
11303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 if (PyUnicode_READY(self) == -1)
11305 return NULL;
11306 length = PyUnicode_GET_LENGTH(self);
11307 kind = PyUnicode_KIND(self);
11308 data = PyUnicode_DATA(self);
11309
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 if (length == 1)
11312 return PyBool_FromLong(
11313 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011315 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011318
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 for (i = 0; i < length; i++) {
11321 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011322
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11324 return PyBool_FromLong(0);
11325 else if (!cased && Py_UNICODE_ISUPPER(ch))
11326 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011328 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329}
11330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011331PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011334Return True if S is a titlecased string and there is at least one\n\
11335character in S, i.e. upper- and titlecase characters may only\n\
11336follow uncased characters and lowercase characters only cased ones.\n\
11337Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
11339static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011340unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 Py_ssize_t i, length;
11343 int kind;
11344 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 int cased, previous_is_cased;
11346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 if (PyUnicode_READY(self) == -1)
11348 return NULL;
11349 length = PyUnicode_GET_LENGTH(self);
11350 kind = PyUnicode_KIND(self);
11351 data = PyUnicode_DATA(self);
11352
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 if (length == 1) {
11355 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11356 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11357 (Py_UNICODE_ISUPPER(ch) != 0));
11358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011360 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011363
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364 cased = 0;
11365 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 for (i = 0; i < length; i++) {
11367 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011368
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11370 if (previous_is_cased)
11371 return PyBool_FromLong(0);
11372 previous_is_cased = 1;
11373 cased = 1;
11374 }
11375 else if (Py_UNICODE_ISLOWER(ch)) {
11376 if (!previous_is_cased)
11377 return PyBool_FromLong(0);
11378 previous_is_cased = 1;
11379 cased = 1;
11380 }
11381 else
11382 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011384 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385}
11386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011387PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011390Return True if all characters in S are whitespace\n\
11391and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392
11393static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011394unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 Py_ssize_t i, length;
11397 int kind;
11398 void *data;
11399
11400 if (PyUnicode_READY(self) == -1)
11401 return NULL;
11402 length = PyUnicode_GET_LENGTH(self);
11403 kind = PyUnicode_KIND(self);
11404 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 if (length == 1)
11408 return PyBool_FromLong(
11409 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011411 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011413 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 for (i = 0; i < length; i++) {
11416 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011417 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011420 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421}
11422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011423PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011424 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011425\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011426Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011427and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011428
11429static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011430unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 Py_ssize_t i, length;
11433 int kind;
11434 void *data;
11435
11436 if (PyUnicode_READY(self) == -1)
11437 return NULL;
11438 length = PyUnicode_GET_LENGTH(self);
11439 kind = PyUnicode_KIND(self);
11440 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011441
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011442 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 if (length == 1)
11444 return PyBool_FromLong(
11445 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011446
11447 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 for (i = 0; i < length; i++) {
11452 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011454 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011455 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011456}
11457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011458PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011460\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011461Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011462and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011463
11464static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011465unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 int kind;
11468 void *data;
11469 Py_ssize_t len, i;
11470
11471 if (PyUnicode_READY(self) == -1)
11472 return NULL;
11473
11474 kind = PyUnicode_KIND(self);
11475 data = PyUnicode_DATA(self);
11476 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011477
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011478 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 if (len == 1) {
11480 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11481 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11482 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011483
11484 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 for (i = 0; i < len; i++) {
11489 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011490 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011492 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011493 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011494}
11495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011496PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011499Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011500False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
11502static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011503unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 Py_ssize_t i, length;
11506 int kind;
11507 void *data;
11508
11509 if (PyUnicode_READY(self) == -1)
11510 return NULL;
11511 length = PyUnicode_GET_LENGTH(self);
11512 kind = PyUnicode_KIND(self);
11513 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 if (length == 1)
11517 return PyBool_FromLong(
11518 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011520 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 for (i = 0; i < length; i++) {
11525 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011528 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529}
11530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011531PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011534Return True if all characters in S are digits\n\
11535and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
11537static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011538unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 Py_ssize_t i, length;
11541 int kind;
11542 void *data;
11543
11544 if (PyUnicode_READY(self) == -1)
11545 return NULL;
11546 length = PyUnicode_GET_LENGTH(self);
11547 kind = PyUnicode_KIND(self);
11548 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (length == 1) {
11552 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11553 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011556 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 for (i = 0; i < length; i++) {
11561 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011564 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565}
11566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011567PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011570Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011571False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572
11573static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011574unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 Py_ssize_t i, length;
11577 int kind;
11578 void *data;
11579
11580 if (PyUnicode_READY(self) == -1)
11581 return NULL;
11582 length = PyUnicode_GET_LENGTH(self);
11583 kind = PyUnicode_KIND(self);
11584 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 if (length == 1)
11588 return PyBool_FromLong(
11589 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011591 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 for (i = 0; i < length; i++) {
11596 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011599 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600}
11601
Martin v. Löwis47383402007-08-15 07:32:56 +000011602int
11603PyUnicode_IsIdentifier(PyObject *self)
11604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 int kind;
11606 void *data;
11607 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011608 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 if (PyUnicode_READY(self) == -1) {
11611 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 }
11614
11615 /* Special case for empty strings */
11616 if (PyUnicode_GET_LENGTH(self) == 0)
11617 return 0;
11618 kind = PyUnicode_KIND(self);
11619 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011620
11621 /* PEP 3131 says that the first character must be in
11622 XID_Start and subsequent characters in XID_Continue,
11623 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011624 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011625 letters, digits, underscore). However, given the current
11626 definition of XID_Start and XID_Continue, it is sufficient
11627 to check just for these, except that _ must be allowed
11628 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011630 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011631 return 0;
11632
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011633 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011635 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011636 return 1;
11637}
11638
11639PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011641\n\
11642Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011643to the language definition.\n\
11644\n\
11645Use keyword.iskeyword() to test for reserved identifiers\n\
11646such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011647
11648static PyObject*
11649unicode_isidentifier(PyObject *self)
11650{
11651 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11652}
11653
Georg Brandl559e5d72008-06-11 18:37:52 +000011654PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011656\n\
11657Return True if all characters in S are considered\n\
11658printable in repr() or S is empty, False otherwise.");
11659
11660static PyObject*
11661unicode_isprintable(PyObject *self)
11662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 Py_ssize_t i, length;
11664 int kind;
11665 void *data;
11666
11667 if (PyUnicode_READY(self) == -1)
11668 return NULL;
11669 length = PyUnicode_GET_LENGTH(self);
11670 kind = PyUnicode_KIND(self);
11671 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011672
11673 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 if (length == 1)
11675 return PyBool_FromLong(
11676 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 for (i = 0; i < length; i++) {
11679 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011680 Py_RETURN_FALSE;
11681 }
11682 }
11683 Py_RETURN_TRUE;
11684}
11685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011686PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011687 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688\n\
11689Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011690iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
11692static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011693unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011695 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696}
11697
Martin v. Löwis18e16552006-02-15 17:27:45 +000011698static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011699unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 if (PyUnicode_READY(self) == -1)
11702 return -1;
11703 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704}
11705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011706PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011707 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011709Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011710done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711
11712static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011713unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011715 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 Py_UCS4 fillchar = ' ';
11717
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011718 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 return NULL;
11720
Benjamin Petersonbac79492012-01-14 13:34:47 -050011721 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011722 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
Victor Stinnerc4b49542011-12-11 22:44:26 +010011724 if (PyUnicode_GET_LENGTH(self) >= width)
11725 return unicode_result_unchanged(self);
11726
11727 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728}
11729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011730PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011731 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011733Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734
11735static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011736unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011738 if (PyUnicode_READY(self) == -1)
11739 return NULL;
11740 if (PyUnicode_IS_ASCII(self))
11741 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011742 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743}
11744
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11753
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011754/* externally visible for str.strip(unicode) */
11755PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011756_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 void *data;
11759 int kind;
11760 Py_ssize_t i, j, len;
11761 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011762 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11765 return NULL;
11766
11767 kind = PyUnicode_KIND(self);
11768 data = PyUnicode_DATA(self);
11769 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011770 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11772 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011773 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011774
Benjamin Peterson14339b62009-01-31 16:36:08 +000011775 i = 0;
11776 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011777 while (i < len) {
11778 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11779 if (!BLOOM(sepmask, ch))
11780 break;
11781 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11782 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 i++;
11784 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011785 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011786
Benjamin Peterson14339b62009-01-31 16:36:08 +000011787 j = len;
11788 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011789 j--;
11790 while (j >= i) {
11791 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11792 if (!BLOOM(sepmask, ch))
11793 break;
11794 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11795 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011797 }
11798
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011801
Victor Stinner7931d9a2011-11-04 00:22:48 +010011802 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803}
11804
11805PyObject*
11806PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11807{
11808 unsigned char *data;
11809 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011810 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811
Victor Stinnerde636f32011-10-01 03:55:54 +020011812 if (PyUnicode_READY(self) == -1)
11813 return NULL;
11814
Victor Stinner684d5fd2012-05-03 02:32:34 +020011815 length = PyUnicode_GET_LENGTH(self);
11816 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011817
Victor Stinner684d5fd2012-05-03 02:32:34 +020011818 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011819 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820
Victor Stinnerde636f32011-10-01 03:55:54 +020011821 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011822 PyErr_SetString(PyExc_IndexError, "string index out of range");
11823 return NULL;
11824 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011825 if (start >= length || end < start)
11826 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011827
Victor Stinner684d5fd2012-05-03 02:32:34 +020011828 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011829 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011830 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011831 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011832 }
11833 else {
11834 kind = PyUnicode_KIND(self);
11835 data = PyUnicode_1BYTE_DATA(self);
11836 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011837 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011838 length);
11839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841
11842static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011843do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 Py_ssize_t len, i, j;
11846
11847 if (PyUnicode_READY(self) == -1)
11848 return NULL;
11849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011851
Victor Stinnercc7af722013-04-09 22:39:24 +020011852 if (PyUnicode_IS_ASCII(self)) {
11853 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11854
11855 i = 0;
11856 if (striptype != RIGHTSTRIP) {
11857 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011858 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020011859 if (!_Py_ascii_whitespace[ch])
11860 break;
11861 i++;
11862 }
11863 }
11864
11865 j = len;
11866 if (striptype != LEFTSTRIP) {
11867 j--;
11868 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011869 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020011870 if (!_Py_ascii_whitespace[ch])
11871 break;
11872 j--;
11873 }
11874 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011875 }
11876 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011877 else {
11878 int kind = PyUnicode_KIND(self);
11879 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011880
Victor Stinnercc7af722013-04-09 22:39:24 +020011881 i = 0;
11882 if (striptype != RIGHTSTRIP) {
11883 while (i < len) {
11884 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11885 if (!Py_UNICODE_ISSPACE(ch))
11886 break;
11887 i++;
11888 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011889 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011890
11891 j = len;
11892 if (striptype != LEFTSTRIP) {
11893 j--;
11894 while (j >= i) {
11895 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11896 if (!Py_UNICODE_ISSPACE(ch))
11897 break;
11898 j--;
11899 }
11900 j++;
11901 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011902 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011903
Victor Stinner7931d9a2011-11-04 00:22:48 +010011904 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905}
11906
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011907
11908static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011909do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011910{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011911 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011912
Serhiy Storchakac6792272013-10-19 21:03:34 +030011913 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011914 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011915
Benjamin Peterson14339b62009-01-31 16:36:08 +000011916 if (sep != NULL && sep != Py_None) {
11917 if (PyUnicode_Check(sep))
11918 return _PyUnicode_XStrip(self, striptype, sep);
11919 else {
11920 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 "%s arg must be None or str",
11922 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011923 return NULL;
11924 }
11925 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011926
Benjamin Peterson14339b62009-01-31 16:36:08 +000011927 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011928}
11929
11930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011931PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011933\n\
11934Return a copy of the string S with leading and trailing\n\
11935whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011936If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011937
11938static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011939unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011940{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011941 if (PyTuple_GET_SIZE(args) == 0)
11942 return do_strip(self, BOTHSTRIP); /* Common case */
11943 else
11944 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011945}
11946
11947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011948PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011950\n\
11951Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011952If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011953
11954static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011955unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011956{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011957 if (PyTuple_GET_SIZE(args) == 0)
11958 return do_strip(self, LEFTSTRIP); /* Common case */
11959 else
11960 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011961}
11962
11963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011964PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011966\n\
11967Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011968If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011969
11970static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011971unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011972{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011973 if (PyTuple_GET_SIZE(args) == 0)
11974 return do_strip(self, RIGHTSTRIP); /* Common case */
11975 else
11976 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011977}
11978
11979
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011981unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011983 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985
Serhiy Storchaka05997252013-01-26 12:14:02 +020011986 if (len < 1)
11987 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988
Victor Stinnerc4b49542011-12-11 22:44:26 +010011989 /* no repeat, return original string */
11990 if (len == 1)
11991 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011992
Benjamin Petersonbac79492012-01-14 13:34:47 -050011993 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 return NULL;
11995
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011996 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011997 PyErr_SetString(PyExc_OverflowError,
11998 "repeated string is too long");
11999 return NULL;
12000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012002
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012003 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004 if (!u)
12005 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012006 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (PyUnicode_GET_LENGTH(str) == 1) {
12009 const int kind = PyUnicode_KIND(str);
12010 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012011 if (kind == PyUnicode_1BYTE_KIND) {
12012 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012013 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012014 }
12015 else if (kind == PyUnicode_2BYTE_KIND) {
12016 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012017 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012018 ucs2[n] = fill_char;
12019 } else {
12020 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12021 assert(kind == PyUnicode_4BYTE_KIND);
12022 for (n = 0; n < len; ++n)
12023 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 }
12026 else {
12027 /* number of characters copied this far */
12028 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012029 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 char *to = (char *) PyUnicode_DATA(u);
12031 Py_MEMCPY(to, PyUnicode_DATA(str),
12032 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 n = (done <= nchars-done) ? done : nchars-done;
12035 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012036 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 }
12039
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012040 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012041 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042}
12043
Alexander Belopolsky40018472011-02-26 01:02:56 +000012044PyObject *
12045PyUnicode_Replace(PyObject *obj,
12046 PyObject *subobj,
12047 PyObject *replobj,
12048 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049{
12050 PyObject *self;
12051 PyObject *str1;
12052 PyObject *str2;
12053 PyObject *result;
12054
12055 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012056 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012059 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 Py_DECREF(self);
12061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 }
12063 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012064 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 Py_DECREF(self);
12066 Py_DECREF(str1);
12067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012069 if (PyUnicode_READY(self) == -1 ||
12070 PyUnicode_READY(str1) == -1 ||
12071 PyUnicode_READY(str2) == -1)
12072 result = NULL;
12073 else
12074 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 Py_DECREF(self);
12076 Py_DECREF(str1);
12077 Py_DECREF(str2);
12078 return result;
12079}
12080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012081PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012082 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083\n\
12084Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012085old replaced by new. If the optional argument count is\n\
12086given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
12088static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 PyObject *str1;
12092 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012093 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094 PyObject *result;
12095
Martin v. Löwis18e16552006-02-15 17:27:45 +000012096 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012098 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012101 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 return NULL;
12103 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012104 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012105 Py_DECREF(str1);
12106 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012107 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012108 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12109 result = NULL;
12110 else
12111 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
12113 Py_DECREF(str1);
12114 Py_DECREF(str2);
12115 return result;
12116}
12117
Alexander Belopolsky40018472011-02-26 01:02:56 +000012118static PyObject *
12119unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012121 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 Py_ssize_t isize;
12123 Py_ssize_t osize, squote, dquote, i, o;
12124 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012125 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012129 return NULL;
12130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 isize = PyUnicode_GET_LENGTH(unicode);
12132 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 /* Compute length of output, quote characters, and
12135 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012136 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 max = 127;
12138 squote = dquote = 0;
12139 ikind = PyUnicode_KIND(unicode);
12140 for (i = 0; i < isize; i++) {
12141 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12142 switch (ch) {
12143 case '\'': squote++; osize++; break;
12144 case '"': dquote++; osize++; break;
12145 case '\\': case '\t': case '\r': case '\n':
12146 osize += 2; break;
12147 default:
12148 /* Fast-path ASCII */
12149 if (ch < ' ' || ch == 0x7f)
12150 osize += 4; /* \xHH */
12151 else if (ch < 0x7f)
12152 osize++;
12153 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12154 osize++;
12155 max = ch > max ? ch : max;
12156 }
12157 else if (ch < 0x100)
12158 osize += 4; /* \xHH */
12159 else if (ch < 0x10000)
12160 osize += 6; /* \uHHHH */
12161 else
12162 osize += 10; /* \uHHHHHHHH */
12163 }
12164 }
12165
12166 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012167 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012169 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 if (dquote)
12171 /* Both squote and dquote present. Use squote,
12172 and escape them */
12173 osize += squote;
12174 else
12175 quote = '"';
12176 }
Victor Stinner55c08782013-04-14 18:45:39 +020012177 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178
12179 repr = PyUnicode_New(osize, max);
12180 if (repr == NULL)
12181 return NULL;
12182 okind = PyUnicode_KIND(repr);
12183 odata = PyUnicode_DATA(repr);
12184
12185 PyUnicode_WRITE(okind, odata, 0, quote);
12186 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012187 if (unchanged) {
12188 _PyUnicode_FastCopyCharacters(repr, 1,
12189 unicode, 0,
12190 isize);
12191 }
12192 else {
12193 for (i = 0, o = 1; i < isize; i++) {
12194 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195
Victor Stinner55c08782013-04-14 18:45:39 +020012196 /* Escape quotes and backslashes */
12197 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012198 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012200 continue;
12201 }
12202
12203 /* Map special whitespace to '\t', \n', '\r' */
12204 if (ch == '\t') {
12205 PyUnicode_WRITE(okind, odata, o++, '\\');
12206 PyUnicode_WRITE(okind, odata, o++, 't');
12207 }
12208 else if (ch == '\n') {
12209 PyUnicode_WRITE(okind, odata, o++, '\\');
12210 PyUnicode_WRITE(okind, odata, o++, 'n');
12211 }
12212 else if (ch == '\r') {
12213 PyUnicode_WRITE(okind, odata, o++, '\\');
12214 PyUnicode_WRITE(okind, odata, o++, 'r');
12215 }
12216
12217 /* Map non-printable US ASCII to '\xhh' */
12218 else if (ch < ' ' || ch == 0x7F) {
12219 PyUnicode_WRITE(okind, odata, o++, '\\');
12220 PyUnicode_WRITE(okind, odata, o++, 'x');
12221 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12222 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12223 }
12224
12225 /* Copy ASCII characters as-is */
12226 else if (ch < 0x7F) {
12227 PyUnicode_WRITE(okind, odata, o++, ch);
12228 }
12229
12230 /* Non-ASCII characters */
12231 else {
12232 /* Map Unicode whitespace and control characters
12233 (categories Z* and C* except ASCII space)
12234 */
12235 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12236 PyUnicode_WRITE(okind, odata, o++, '\\');
12237 /* Map 8-bit characters to '\xhh' */
12238 if (ch <= 0xff) {
12239 PyUnicode_WRITE(okind, odata, o++, 'x');
12240 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12241 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12242 }
12243 /* Map 16-bit characters to '\uxxxx' */
12244 else if (ch <= 0xffff) {
12245 PyUnicode_WRITE(okind, odata, o++, 'u');
12246 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12247 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12248 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12249 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12250 }
12251 /* Map 21-bit characters to '\U00xxxxxx' */
12252 else {
12253 PyUnicode_WRITE(okind, odata, o++, 'U');
12254 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12255 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12256 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12257 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12258 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12259 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12260 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12261 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12262 }
12263 }
12264 /* Copy characters as-is */
12265 else {
12266 PyUnicode_WRITE(okind, odata, o++, ch);
12267 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012268 }
12269 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012272 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012273 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274}
12275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012276PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278\n\
12279Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012280such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281arguments start and end are interpreted as in slice notation.\n\
12282\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012283Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284
12285static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012288 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012289 Py_ssize_t start;
12290 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292
Jesus Ceaac451502011-04-20 17:09:23 +020012293 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12294 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
Christian Heimesea71a522013-06-29 21:17:34 +020012297 if (PyUnicode_READY(self) == -1) {
12298 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012300 }
12301 if (PyUnicode_READY(substring) == -1) {
12302 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305
Victor Stinner7931d9a2011-11-04 00:22:48 +010012306 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
12308 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (result == -2)
12311 return NULL;
12312
Christian Heimes217cfd12007-12-02 14:31:20 +000012313 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314}
12315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012316PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012317 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012319Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320
12321static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012324 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012325 Py_ssize_t start;
12326 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012327 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
Jesus Ceaac451502011-04-20 17:09:23 +020012329 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12330 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332
Christian Heimesea71a522013-06-29 21:17:34 +020012333 if (PyUnicode_READY(self) == -1) {
12334 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012336 }
12337 if (PyUnicode_READY(substring) == -1) {
12338 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341
Victor Stinner7931d9a2011-11-04 00:22:48 +010012342 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343
12344 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 if (result == -2)
12347 return NULL;
12348
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349 if (result < 0) {
12350 PyErr_SetString(PyExc_ValueError, "substring not found");
12351 return NULL;
12352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353
Christian Heimes217cfd12007-12-02 14:31:20 +000012354 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355}
12356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012357PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012360Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012361done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
12363static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012364unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012366 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 Py_UCS4 fillchar = ' ';
12368
Victor Stinnere9a29352011-10-01 02:14:59 +020012369 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012371
Benjamin Petersonbac79492012-01-14 13:34:47 -050012372 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373 return NULL;
12374
Victor Stinnerc4b49542011-12-11 22:44:26 +010012375 if (PyUnicode_GET_LENGTH(self) >= width)
12376 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
Victor Stinnerc4b49542011-12-11 22:44:26 +010012378 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379}
12380
Alexander Belopolsky40018472011-02-26 01:02:56 +000012381PyObject *
12382PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383{
12384 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012385
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 s = PyUnicode_FromObject(s);
12387 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012388 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 if (sep != NULL) {
12390 sep = PyUnicode_FromObject(sep);
12391 if (sep == NULL) {
12392 Py_DECREF(s);
12393 return NULL;
12394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395 }
12396
Victor Stinner9310abb2011-10-05 00:59:23 +020012397 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012398
12399 Py_DECREF(s);
12400 Py_XDECREF(sep);
12401 return result;
12402}
12403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012404PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012405 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406\n\
12407Return a list of the words in S, using sep as the\n\
12408delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012409splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012410whitespace string is a separator and empty strings are\n\
12411removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412
12413static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012414unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012416 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012418 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012420 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12421 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 return NULL;
12423
12424 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012425 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012427 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012429 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430}
12431
Thomas Wouters477c8d52006-05-27 19:21:47 +000012432PyObject *
12433PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12434{
12435 PyObject* str_obj;
12436 PyObject* sep_obj;
12437 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 int kind1, kind2, kind;
12439 void *buf1 = NULL, *buf2 = NULL;
12440 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012441
12442 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012443 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012446 if (!sep_obj) {
12447 Py_DECREF(str_obj);
12448 return NULL;
12449 }
12450 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12451 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012452 Py_DECREF(str_obj);
12453 return NULL;
12454 }
12455
Victor Stinner14f8f022011-10-05 20:58:25 +020012456 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012458 kind = Py_MAX(kind1, kind2);
12459 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012461 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 if (!buf1)
12463 goto onError;
12464 buf2 = PyUnicode_DATA(sep_obj);
12465 if (kind2 != kind)
12466 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12467 if (!buf2)
12468 goto onError;
12469 len1 = PyUnicode_GET_LENGTH(str_obj);
12470 len2 = PyUnicode_GET_LENGTH(sep_obj);
12471
Benjamin Petersonead6b532011-12-20 17:23:42 -060012472 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012474 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12475 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12476 else
12477 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 break;
12479 case PyUnicode_2BYTE_KIND:
12480 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12481 break;
12482 case PyUnicode_4BYTE_KIND:
12483 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12484 break;
12485 default:
12486 assert(0);
12487 out = 0;
12488 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012489
12490 Py_DECREF(sep_obj);
12491 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 if (kind1 != kind)
12493 PyMem_Free(buf1);
12494 if (kind2 != kind)
12495 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012496
12497 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 onError:
12499 Py_DECREF(sep_obj);
12500 Py_DECREF(str_obj);
12501 if (kind1 != kind && buf1)
12502 PyMem_Free(buf1);
12503 if (kind2 != kind && buf2)
12504 PyMem_Free(buf2);
12505 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012506}
12507
12508
12509PyObject *
12510PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12511{
12512 PyObject* str_obj;
12513 PyObject* sep_obj;
12514 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 int kind1, kind2, kind;
12516 void *buf1 = NULL, *buf2 = NULL;
12517 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012518
12519 str_obj = PyUnicode_FromObject(str_in);
12520 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012521 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012522 sep_obj = PyUnicode_FromObject(sep_in);
12523 if (!sep_obj) {
12524 Py_DECREF(str_obj);
12525 return NULL;
12526 }
12527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 kind1 = PyUnicode_KIND(str_in);
12529 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012530 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 buf1 = PyUnicode_DATA(str_in);
12532 if (kind1 != kind)
12533 buf1 = _PyUnicode_AsKind(str_in, kind);
12534 if (!buf1)
12535 goto onError;
12536 buf2 = PyUnicode_DATA(sep_obj);
12537 if (kind2 != kind)
12538 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12539 if (!buf2)
12540 goto onError;
12541 len1 = PyUnicode_GET_LENGTH(str_obj);
12542 len2 = PyUnicode_GET_LENGTH(sep_obj);
12543
Benjamin Petersonead6b532011-12-20 17:23:42 -060012544 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012546 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12547 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12548 else
12549 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 break;
12551 case PyUnicode_2BYTE_KIND:
12552 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12553 break;
12554 case PyUnicode_4BYTE_KIND:
12555 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12556 break;
12557 default:
12558 assert(0);
12559 out = 0;
12560 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012561
12562 Py_DECREF(sep_obj);
12563 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 if (kind1 != kind)
12565 PyMem_Free(buf1);
12566 if (kind2 != kind)
12567 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012568
12569 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 onError:
12571 Py_DECREF(sep_obj);
12572 Py_DECREF(str_obj);
12573 if (kind1 != kind && buf1)
12574 PyMem_Free(buf1);
12575 if (kind2 != kind && buf2)
12576 PyMem_Free(buf2);
12577 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012578}
12579
12580PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012581 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012582\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012583Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012584the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012585found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012586
12587static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012588unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012589{
Victor Stinner9310abb2011-10-05 00:59:23 +020012590 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012591}
12592
12593PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012594 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012595\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012596Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012597the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012598separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012599
12600static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012601unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012602{
Victor Stinner9310abb2011-10-05 00:59:23 +020012603 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012604}
12605
Alexander Belopolsky40018472011-02-26 01:02:56 +000012606PyObject *
12607PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012608{
12609 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012610
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012611 s = PyUnicode_FromObject(s);
12612 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012613 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 if (sep != NULL) {
12615 sep = PyUnicode_FromObject(sep);
12616 if (sep == NULL) {
12617 Py_DECREF(s);
12618 return NULL;
12619 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012620 }
12621
Victor Stinner9310abb2011-10-05 00:59:23 +020012622 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012623
12624 Py_DECREF(s);
12625 Py_XDECREF(sep);
12626 return result;
12627}
12628
12629PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012630 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012631\n\
12632Return a list of the words in S, using sep as the\n\
12633delimiter string, starting at the end of the string and\n\
12634working to the front. If maxsplit is given, at most maxsplit\n\
12635splits are done. If sep is not specified, any whitespace string\n\
12636is a separator.");
12637
12638static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012639unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012640{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012641 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012642 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012643 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012644
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012645 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12646 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012647 return NULL;
12648
12649 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012651 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012652 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012653 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012654 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012655}
12656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012657PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012658 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659\n\
12660Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012661Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012662is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663
12664static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012665unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012667 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012668 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012670 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12671 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672 return NULL;
12673
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012674 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675}
12676
12677static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012678PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012680 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681}
12682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012683PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685\n\
12686Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012687and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688
12689static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012690unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012692 if (PyUnicode_READY(self) == -1)
12693 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012694 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695}
12696
Larry Hastings31826802013-10-19 00:09:25 -070012697/*[clinic]
12698module str
Georg Brandlceee0772007-11-27 23:48:05 +000012699
Larry Hastings31826802013-10-19 00:09:25 -070012700@staticmethod
12701str.maketrans as unicode_maketrans
12702
12703 x: object
12704
12705 y: unicode=NULL
12706
12707 z: unicode=NULL
12708
12709 /
12710
12711Return a translation table usable for str.translate().
12712
12713If there is only one argument, it must be a dictionary mapping Unicode
12714ordinals (integers) or characters to Unicode ordinals, strings or None.
12715Character keys will be then converted to ordinals.
12716If there are two arguments, they must be strings of equal length, and
12717in the resulting dictionary, each character in x will be mapped to the
12718character at the same position in y. If there is a third argument, it
12719must be a string, whose characters will be mapped to None in the result.
12720[clinic]*/
12721
12722PyDoc_STRVAR(unicode_maketrans__doc__,
12723"Return a translation table usable for str.translate().\n"
12724"\n"
12725"str.maketrans(x, y=None, z=None)\n"
12726"\n"
12727"If there is only one argument, it must be a dictionary mapping Unicode\n"
12728"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12729"Character keys will be then converted to ordinals.\n"
12730"If there are two arguments, they must be strings of equal length, and\n"
12731"in the resulting dictionary, each character in x will be mapped to the\n"
12732"character at the same position in y. If there is a third argument, it\n"
12733"must be a string, whose characters will be mapped to None in the result.");
12734
12735#define UNICODE_MAKETRANS_METHODDEF \
12736 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12737
12738static PyObject *
12739unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
12740
12741static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012742unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012743{
Larry Hastings31826802013-10-19 00:09:25 -070012744 PyObject *return_value = NULL;
12745 PyObject *x;
12746 PyObject *y = NULL;
12747 PyObject *z = NULL;
12748
12749 if (!PyArg_ParseTuple(args,
12750 "O|UU:maketrans",
12751 &x, &y, &z))
12752 goto exit;
12753 return_value = unicode_maketrans_impl(x, y, z);
12754
12755exit:
12756 return return_value;
12757}
12758
12759static PyObject *
12760unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12761/*[clinic checksum: 137db9c3199e7906b7967009f511c24fa3235b5f]*/
12762{
Georg Brandlceee0772007-11-27 23:48:05 +000012763 PyObject *new = NULL, *key, *value;
12764 Py_ssize_t i = 0;
12765 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012766
Georg Brandlceee0772007-11-27 23:48:05 +000012767 new = PyDict_New();
12768 if (!new)
12769 return NULL;
12770 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771 int x_kind, y_kind, z_kind;
12772 void *x_data, *y_data, *z_data;
12773
Georg Brandlceee0772007-11-27 23:48:05 +000012774 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012775 if (!PyUnicode_Check(x)) {
12776 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12777 "be a string if there is a second argument");
12778 goto err;
12779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012781 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12782 "arguments must have equal length");
12783 goto err;
12784 }
12785 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 x_kind = PyUnicode_KIND(x);
12787 y_kind = PyUnicode_KIND(y);
12788 x_data = PyUnicode_DATA(x);
12789 y_data = PyUnicode_DATA(y);
12790 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12791 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012792 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012793 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012794 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012795 if (!value) {
12796 Py_DECREF(key);
12797 goto err;
12798 }
Georg Brandlceee0772007-11-27 23:48:05 +000012799 res = PyDict_SetItem(new, key, value);
12800 Py_DECREF(key);
12801 Py_DECREF(value);
12802 if (res < 0)
12803 goto err;
12804 }
12805 /* create entries for deleting chars in z */
12806 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 z_kind = PyUnicode_KIND(z);
12808 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012809 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012811 if (!key)
12812 goto err;
12813 res = PyDict_SetItem(new, key, Py_None);
12814 Py_DECREF(key);
12815 if (res < 0)
12816 goto err;
12817 }
12818 }
12819 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 int kind;
12821 void *data;
12822
Georg Brandlceee0772007-11-27 23:48:05 +000012823 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012824 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012825 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12826 "to maketrans it must be a dict");
12827 goto err;
12828 }
12829 /* copy entries into the new dict, converting string keys to int keys */
12830 while (PyDict_Next(x, &i, &key, &value)) {
12831 if (PyUnicode_Check(key)) {
12832 /* convert string keys to integer keys */
12833 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012834 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012835 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12836 "table must be of length 1");
12837 goto err;
12838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 kind = PyUnicode_KIND(key);
12840 data = PyUnicode_DATA(key);
12841 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012842 if (!newkey)
12843 goto err;
12844 res = PyDict_SetItem(new, newkey, value);
12845 Py_DECREF(newkey);
12846 if (res < 0)
12847 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012848 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012849 /* just keep integer keys */
12850 if (PyDict_SetItem(new, key, value) < 0)
12851 goto err;
12852 } else {
12853 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12854 "be strings or integers");
12855 goto err;
12856 }
12857 }
12858 }
12859 return new;
12860 err:
12861 Py_DECREF(new);
12862 return NULL;
12863}
12864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012865PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867\n\
12868Return a copy of the string S, where all characters have been mapped\n\
12869through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012870Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012871Unmapped characters are left untouched. Characters mapped to None\n\
12872are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873
12874static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878}
12879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012880PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012881 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012883Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884
12885static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012886unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012888 if (PyUnicode_READY(self) == -1)
12889 return NULL;
12890 if (PyUnicode_IS_ASCII(self))
12891 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012892 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893}
12894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012895PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012896 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012898Pad a numeric string S with zeros on the left, to fill a field\n\
12899of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900
12901static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012902unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012904 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012905 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012906 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 int kind;
12908 void *data;
12909 Py_UCS4 chr;
12910
Martin v. Löwis18e16552006-02-15 17:27:45 +000012911 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912 return NULL;
12913
Benjamin Petersonbac79492012-01-14 13:34:47 -050012914 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916
Victor Stinnerc4b49542011-12-11 22:44:26 +010012917 if (PyUnicode_GET_LENGTH(self) >= width)
12918 return unicode_result_unchanged(self);
12919
12920 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012921
12922 u = pad(self, fill, 0, '0');
12923
Walter Dörwald068325e2002-04-15 13:36:47 +000012924 if (u == NULL)
12925 return NULL;
12926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 kind = PyUnicode_KIND(u);
12928 data = PyUnicode_DATA(u);
12929 chr = PyUnicode_READ(kind, data, fill);
12930
12931 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 PyUnicode_WRITE(kind, data, 0, chr);
12934 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935 }
12936
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012937 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012938 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012940
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012949PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012952Return True if S starts with the specified prefix, False otherwise.\n\
12953With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012954With optional end, stop comparing S at that position.\n\
12955prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956
12957static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012958unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012959 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012961 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012962 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012963 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012964 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012965 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012966
Jesus Ceaac451502011-04-20 17:09:23 +020012967 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012969 if (PyTuple_Check(subobj)) {
12970 Py_ssize_t i;
12971 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012972 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012973 if (substring == NULL)
12974 return NULL;
12975 result = tailmatch(self, substring, start, end, -1);
12976 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012977 if (result == -1)
12978 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012979 if (result) {
12980 Py_RETURN_TRUE;
12981 }
12982 }
12983 /* nothing matched */
12984 Py_RETURN_FALSE;
12985 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012986 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012987 if (substring == NULL) {
12988 if (PyErr_ExceptionMatches(PyExc_TypeError))
12989 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12990 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012991 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012992 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012993 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012995 if (result == -1)
12996 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012997 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012998}
12999
13000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013001PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013002 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013003\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013004Return True if S ends with the specified suffix, False otherwise.\n\
13005With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013006With optional end, stop comparing S at that position.\n\
13007suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008
13009static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013010unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013011 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013012{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013013 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013014 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013015 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013016 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013017 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018
Jesus Ceaac451502011-04-20 17:09:23 +020013019 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013020 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013021 if (PyTuple_Check(subobj)) {
13022 Py_ssize_t i;
13023 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013024 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013025 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013026 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013028 result = tailmatch(self, substring, start, end, +1);
13029 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013030 if (result == -1)
13031 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013032 if (result) {
13033 Py_RETURN_TRUE;
13034 }
13035 }
13036 Py_RETURN_FALSE;
13037 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013038 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013039 if (substring == NULL) {
13040 if (PyErr_ExceptionMatches(PyExc_TypeError))
13041 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13042 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013043 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013044 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013045 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013046 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013047 if (result == -1)
13048 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013049 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050}
13051
Victor Stinner202fdca2012-05-07 12:47:02 +020013052Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013053_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013054{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013055 if (!writer->readonly)
13056 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13057 else {
13058 /* Copy-on-write mode: set buffer size to 0 so
13059 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13060 * next write. */
13061 writer->size = 0;
13062 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013063 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13064 writer->data = PyUnicode_DATA(writer->buffer);
13065 writer->kind = PyUnicode_KIND(writer->buffer);
13066}
13067
Victor Stinnerd3f08822012-05-29 12:57:52 +020013068void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013069_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013070{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013071 memset(writer, 0, sizeof(*writer));
13072#ifdef Py_DEBUG
13073 writer->kind = 5; /* invalid kind */
13074#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013075 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013076}
13077
Victor Stinnerd3f08822012-05-29 12:57:52 +020013078int
13079_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13080 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013081{
13082 Py_ssize_t newlen;
13083 PyObject *newbuffer;
13084
Victor Stinnerd3f08822012-05-29 12:57:52 +020013085 assert(length > 0);
13086
Victor Stinner202fdca2012-05-07 12:47:02 +020013087 if (length > PY_SSIZE_T_MAX - writer->pos) {
13088 PyErr_NoMemory();
13089 return -1;
13090 }
13091 newlen = writer->pos + length;
13092
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013093 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013094
Victor Stinnerd3f08822012-05-29 12:57:52 +020013095 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013096 assert(!writer->readonly);
13097 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013098 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013099 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013100 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013101 if (newlen < writer->min_length)
13102 newlen = writer->min_length;
13103
Victor Stinnerd3f08822012-05-29 12:57:52 +020013104 writer->buffer = PyUnicode_New(newlen, maxchar);
13105 if (writer->buffer == NULL)
13106 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013107 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013108 else if (newlen > writer->size) {
13109 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013110 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013111 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013112 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013113 if (newlen < writer->min_length)
13114 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013115
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013116 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013117 /* resize + widen */
13118 newbuffer = PyUnicode_New(newlen, maxchar);
13119 if (newbuffer == NULL)
13120 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013121 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13122 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013123 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013124 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013125 }
13126 else {
13127 newbuffer = resize_compact(writer->buffer, newlen);
13128 if (newbuffer == NULL)
13129 return -1;
13130 }
13131 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013132 }
13133 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013134 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013135 newbuffer = PyUnicode_New(writer->size, maxchar);
13136 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013137 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013138 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13139 writer->buffer, 0, writer->pos);
13140 Py_DECREF(writer->buffer);
13141 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013142 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013143 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013144 return 0;
13145}
13146
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013147Py_LOCAL_INLINE(int)
13148_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013149{
13150 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13151 return -1;
13152 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13153 writer->pos++;
13154 return 0;
13155}
13156
13157int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013158_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13159{
13160 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13161}
13162
13163int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013164_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13165{
13166 Py_UCS4 maxchar;
13167 Py_ssize_t len;
13168
13169 if (PyUnicode_READY(str) == -1)
13170 return -1;
13171 len = PyUnicode_GET_LENGTH(str);
13172 if (len == 0)
13173 return 0;
13174 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13175 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013176 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013177 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013178 Py_INCREF(str);
13179 writer->buffer = str;
13180 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013181 writer->pos += len;
13182 return 0;
13183 }
13184 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13185 return -1;
13186 }
13187 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13188 str, 0, len);
13189 writer->pos += len;
13190 return 0;
13191}
13192
Victor Stinnere215d962012-10-06 23:03:36 +020013193int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013194_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13195 Py_ssize_t start, Py_ssize_t end)
13196{
13197 Py_UCS4 maxchar;
13198 Py_ssize_t len;
13199
13200 if (PyUnicode_READY(str) == -1)
13201 return -1;
13202
13203 assert(0 <= start);
13204 assert(end <= PyUnicode_GET_LENGTH(str));
13205 assert(start <= end);
13206
13207 if (end == 0)
13208 return 0;
13209
13210 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13211 return _PyUnicodeWriter_WriteStr(writer, str);
13212
13213 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13214 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13215 else
13216 maxchar = writer->maxchar;
13217 len = end - start;
13218
13219 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13220 return -1;
13221
13222 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13223 str, start, len);
13224 writer->pos += len;
13225 return 0;
13226}
13227
13228int
Victor Stinnere215d962012-10-06 23:03:36 +020013229_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13230{
13231 Py_UCS4 maxchar;
13232
13233 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13234 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13235 return -1;
13236 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13237 writer->pos += len;
13238 return 0;
13239}
13240
Victor Stinnerd3f08822012-05-29 12:57:52 +020013241PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013242_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013243{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013244 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013245 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013246 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013247 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013248 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013249 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013250 str = writer->buffer;
13251 writer->buffer = NULL;
13252 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13253 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013254 }
13255 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13256 PyObject *newbuffer;
13257 newbuffer = resize_compact(writer->buffer, writer->pos);
13258 if (newbuffer == NULL) {
13259 Py_DECREF(writer->buffer);
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013260 writer->buffer = NULL;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013261 return NULL;
13262 }
13263 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013264 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013265 str = writer->buffer;
13266 writer->buffer = NULL;
13267 assert(_PyUnicode_CheckConsistency(str, 1));
13268 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013269}
13270
Victor Stinnerd3f08822012-05-29 12:57:52 +020013271void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013272_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013273{
13274 Py_CLEAR(writer->buffer);
13275}
13276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013278
13279PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013281\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013282Return a formatted version of S, using substitutions from args and kwargs.\n\
13283The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013284
Eric Smith27bbca62010-11-04 17:06:58 +000013285PyDoc_STRVAR(format_map__doc__,
13286 "S.format_map(mapping) -> str\n\
13287\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013288Return a formatted version of S, using substitutions from mapping.\n\
13289The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013290
Eric Smith4a7d76d2008-05-30 18:10:19 +000013291static PyObject *
13292unicode__format__(PyObject* self, PyObject* args)
13293{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013294 PyObject *format_spec;
13295 _PyUnicodeWriter writer;
13296 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013297
13298 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13299 return NULL;
13300
Victor Stinnerd3f08822012-05-29 12:57:52 +020013301 if (PyUnicode_READY(self) == -1)
13302 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013303 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013304 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13305 self, format_spec, 0,
13306 PyUnicode_GET_LENGTH(format_spec));
13307 if (ret == -1) {
13308 _PyUnicodeWriter_Dealloc(&writer);
13309 return NULL;
13310 }
13311 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013312}
13313
Eric Smith8c663262007-08-25 02:26:07 +000013314PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013315 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013316\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013317Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013318
13319static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013320unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 Py_ssize_t size;
13323
13324 /* If it's a compact object, account for base structure +
13325 character data. */
13326 if (PyUnicode_IS_COMPACT_ASCII(v))
13327 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13328 else if (PyUnicode_IS_COMPACT(v))
13329 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013330 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 else {
13332 /* If it is a two-block object, account for base object, and
13333 for character block if present. */
13334 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013335 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013336 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013337 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 }
13339 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013340 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013341 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013342 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013343 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013344 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013345
13346 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013347}
13348
13349PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013350 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013351
13352static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013353unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013354{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013355 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013356 if (!copy)
13357 return NULL;
13358 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013359}
13360
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013362 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013363 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013364 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13365 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013366 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13367 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013368 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013369 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13370 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13371 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13372 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13373 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013374 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013375 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13376 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13377 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013378 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013379 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13380 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13381 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013382 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013383 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013384 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013385 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013386 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13387 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13388 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13389 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13390 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13391 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13392 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13393 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13394 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13395 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13396 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13397 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13398 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13399 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013400 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013401 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013402 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013403 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013404 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013405 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013406 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013407 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013408#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013409 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013410 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411#endif
13412
Benjamin Peterson14339b62009-01-31 16:36:08 +000013413 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414 {NULL, NULL}
13415};
13416
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013417static PyObject *
13418unicode_mod(PyObject *v, PyObject *w)
13419{
Brian Curtindfc80e32011-08-10 20:28:54 -050013420 if (!PyUnicode_Check(v))
13421 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013423}
13424
13425static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013426 0, /*nb_add*/
13427 0, /*nb_subtract*/
13428 0, /*nb_multiply*/
13429 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013430};
13431
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013433 (lenfunc) unicode_length, /* sq_length */
13434 PyUnicode_Concat, /* sq_concat */
13435 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13436 (ssizeargfunc) unicode_getitem, /* sq_item */
13437 0, /* sq_slice */
13438 0, /* sq_ass_item */
13439 0, /* sq_ass_slice */
13440 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441};
13442
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013443static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013444unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013446 if (PyUnicode_READY(self) == -1)
13447 return NULL;
13448
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013449 if (PyIndex_Check(item)) {
13450 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013451 if (i == -1 && PyErr_Occurred())
13452 return NULL;
13453 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013454 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013455 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013456 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013457 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013458 PyObject *result;
13459 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013460 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013461 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013465 return NULL;
13466 }
13467
13468 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013469 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013470 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013471 slicelength == PyUnicode_GET_LENGTH(self)) {
13472 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013473 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013474 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013475 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013476 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013477 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013478 src_kind = PyUnicode_KIND(self);
13479 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013480 if (!PyUnicode_IS_ASCII(self)) {
13481 kind_limit = kind_maxchar_limit(src_kind);
13482 max_char = 0;
13483 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13484 ch = PyUnicode_READ(src_kind, src_data, cur);
13485 if (ch > max_char) {
13486 max_char = ch;
13487 if (max_char >= kind_limit)
13488 break;
13489 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013490 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013491 }
Victor Stinner55c99112011-10-13 01:17:06 +020013492 else
13493 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013494 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013495 if (result == NULL)
13496 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013497 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013498 dest_data = PyUnicode_DATA(result);
13499
13500 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013501 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13502 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013503 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013504 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013505 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013506 } else {
13507 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13508 return NULL;
13509 }
13510}
13511
13512static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013513 (lenfunc)unicode_length, /* mp_length */
13514 (binaryfunc)unicode_subscript, /* mp_subscript */
13515 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013516};
13517
Guido van Rossumd57fd912000-03-10 22:53:23 +000013518
Guido van Rossumd57fd912000-03-10 22:53:23 +000013519/* Helpers for PyUnicode_Format() */
13520
Victor Stinnera47082312012-10-04 02:19:54 +020013521struct unicode_formatter_t {
13522 PyObject *args;
13523 int args_owned;
13524 Py_ssize_t arglen, argidx;
13525 PyObject *dict;
13526
13527 enum PyUnicode_Kind fmtkind;
13528 Py_ssize_t fmtcnt, fmtpos;
13529 void *fmtdata;
13530 PyObject *fmtstr;
13531
13532 _PyUnicodeWriter writer;
13533};
13534
13535struct unicode_format_arg_t {
13536 Py_UCS4 ch;
13537 int flags;
13538 Py_ssize_t width;
13539 int prec;
13540 int sign;
13541};
13542
Guido van Rossumd57fd912000-03-10 22:53:23 +000013543static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013544unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013545{
Victor Stinnera47082312012-10-04 02:19:54 +020013546 Py_ssize_t argidx = ctx->argidx;
13547
13548 if (argidx < ctx->arglen) {
13549 ctx->argidx++;
13550 if (ctx->arglen < 0)
13551 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 else
Victor Stinnera47082312012-10-04 02:19:54 +020013553 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013554 }
13555 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013557 return NULL;
13558}
13559
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013560/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013561
Victor Stinnera47082312012-10-04 02:19:54 +020013562/* Format a float into the writer if the writer is not NULL, or into *p_output
13563 otherwise.
13564
13565 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013566static int
Victor Stinnera47082312012-10-04 02:19:54 +020013567formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13568 PyObject **p_output,
13569 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013570{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013571 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013572 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013573 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013574 int prec;
13575 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013576
Guido van Rossumd57fd912000-03-10 22:53:23 +000013577 x = PyFloat_AsDouble(v);
13578 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013579 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013580
Victor Stinnera47082312012-10-04 02:19:54 +020013581 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013582 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013584
Victor Stinnera47082312012-10-04 02:19:54 +020013585 if (arg->flags & F_ALT)
13586 dtoa_flags = Py_DTSF_ALT;
13587 else
13588 dtoa_flags = 0;
13589 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013590 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013591 return -1;
13592 len = strlen(p);
13593 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013594 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13595 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013596 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013597 }
Victor Stinner184252a2012-06-16 02:57:41 +020013598 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013599 writer->pos += len;
13600 }
13601 else
13602 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013603 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013604 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013605}
13606
Victor Stinnerd0880d52012-04-27 23:40:13 +020013607/* formatlong() emulates the format codes d, u, o, x and X, and
13608 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13609 * Python's regular ints.
13610 * Return value: a new PyUnicodeObject*, or NULL if error.
13611 * The output string is of the form
13612 * "-"? ("0x" | "0X")? digit+
13613 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13614 * set in flags. The case of hex digits will be correct,
13615 * There will be at least prec digits, zero-filled on the left if
13616 * necessary to get that many.
13617 * val object to be converted
13618 * flags bitmask of format flags; only F_ALT is looked at
13619 * prec minimum number of digits; 0-fill on left if needed
13620 * type a character in [duoxX]; u acts the same as d
13621 *
13622 * CAUTION: o, x and X conversions on regular ints can never
13623 * produce a '-' sign, but can for Python's unbounded ints.
13624 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013625static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013626formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013627{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013628 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013629 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013630 Py_ssize_t i;
13631 int sign; /* 1 if '-', else 0 */
13632 int len; /* number of characters */
13633 Py_ssize_t llen;
13634 int numdigits; /* len == numnondigits + numdigits */
13635 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013636 int prec = arg->prec;
13637 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013638
Victor Stinnerd0880d52012-04-27 23:40:13 +020013639 /* Avoid exceeding SSIZE_T_MAX */
13640 if (prec > INT_MAX-3) {
13641 PyErr_SetString(PyExc_OverflowError,
13642 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013643 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013644 }
13645
13646 assert(PyLong_Check(val));
13647
13648 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013649 default:
13650 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013651 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013652 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013653 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013654 /* int and int subclasses should print numerically when a numeric */
13655 /* format code is used (see issue18780) */
13656 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013657 break;
13658 case 'o':
13659 numnondigits = 2;
13660 result = PyNumber_ToBase(val, 8);
13661 break;
13662 case 'x':
13663 case 'X':
13664 numnondigits = 2;
13665 result = PyNumber_ToBase(val, 16);
13666 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013667 }
13668 if (!result)
13669 return NULL;
13670
13671 assert(unicode_modifiable(result));
13672 assert(PyUnicode_IS_READY(result));
13673 assert(PyUnicode_IS_ASCII(result));
13674
13675 /* To modify the string in-place, there can only be one reference. */
13676 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013677 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013678 PyErr_BadInternalCall();
13679 return NULL;
13680 }
13681 buf = PyUnicode_DATA(result);
13682 llen = PyUnicode_GET_LENGTH(result);
13683 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013684 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013685 PyErr_SetString(PyExc_ValueError,
13686 "string too large in _PyBytes_FormatLong");
13687 return NULL;
13688 }
13689 len = (int)llen;
13690 sign = buf[0] == '-';
13691 numnondigits += sign;
13692 numdigits = len - numnondigits;
13693 assert(numdigits > 0);
13694
13695 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013696 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013697 (type == 'o' || type == 'x' || type == 'X'))) {
13698 assert(buf[sign] == '0');
13699 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13700 buf[sign+1] == 'o');
13701 numnondigits -= 2;
13702 buf += 2;
13703 len -= 2;
13704 if (sign)
13705 buf[0] = '-';
13706 assert(len == numnondigits + numdigits);
13707 assert(numdigits > 0);
13708 }
13709
13710 /* Fill with leading zeroes to meet minimum width. */
13711 if (prec > numdigits) {
13712 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13713 numnondigits + prec);
13714 char *b1;
13715 if (!r1) {
13716 Py_DECREF(result);
13717 return NULL;
13718 }
13719 b1 = PyBytes_AS_STRING(r1);
13720 for (i = 0; i < numnondigits; ++i)
13721 *b1++ = *buf++;
13722 for (i = 0; i < prec - numdigits; i++)
13723 *b1++ = '0';
13724 for (i = 0; i < numdigits; i++)
13725 *b1++ = *buf++;
13726 *b1 = '\0';
13727 Py_DECREF(result);
13728 result = r1;
13729 buf = PyBytes_AS_STRING(result);
13730 len = numnondigits + prec;
13731 }
13732
13733 /* Fix up case for hex conversions. */
13734 if (type == 'X') {
13735 /* Need to convert all lower case letters to upper case.
13736 and need to convert 0x to 0X (and -0x to -0X). */
13737 for (i = 0; i < len; i++)
13738 if (buf[i] >= 'a' && buf[i] <= 'x')
13739 buf[i] -= 'a'-'A';
13740 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013741 if (!PyUnicode_Check(result)
13742 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013743 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013744 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013745 Py_DECREF(result);
13746 result = unicode;
13747 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013748 else if (len != PyUnicode_GET_LENGTH(result)) {
13749 if (PyUnicode_Resize(&result, len) < 0)
13750 Py_CLEAR(result);
13751 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013752 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013753}
13754
Victor Stinner621ef3d2012-10-02 00:33:47 +020013755/* Format an integer.
13756 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013757 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013758 * -1 and raise an exception on error */
13759static int
Victor Stinnera47082312012-10-04 02:19:54 +020013760mainformatlong(PyObject *v,
13761 struct unicode_format_arg_t *arg,
13762 PyObject **p_output,
13763 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013764{
13765 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013766 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013767
13768 if (!PyNumber_Check(v))
13769 goto wrongtype;
13770
13771 if (!PyLong_Check(v)) {
13772 iobj = PyNumber_Long(v);
13773 if (iobj == NULL) {
13774 if (PyErr_ExceptionMatches(PyExc_TypeError))
13775 goto wrongtype;
13776 return -1;
13777 }
13778 assert(PyLong_Check(iobj));
13779 }
13780 else {
13781 iobj = v;
13782 Py_INCREF(iobj);
13783 }
13784
13785 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013786 && arg->width == -1 && arg->prec == -1
13787 && !(arg->flags & (F_SIGN | F_BLANK))
13788 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013789 {
13790 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013791 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013792 int base;
13793
Victor Stinnera47082312012-10-04 02:19:54 +020013794 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013795 {
13796 default:
13797 assert(0 && "'type' not in [diuoxX]");
13798 case 'd':
13799 case 'i':
13800 case 'u':
13801 base = 10;
13802 break;
13803 case 'o':
13804 base = 8;
13805 break;
13806 case 'x':
13807 case 'X':
13808 base = 16;
13809 break;
13810 }
13811
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013812 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13813 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013814 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013815 }
13816 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013817 return 1;
13818 }
13819
Victor Stinnera47082312012-10-04 02:19:54 +020013820 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013821 Py_DECREF(iobj);
13822 if (res == NULL)
13823 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013824 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013825 return 0;
13826
13827wrongtype:
13828 PyErr_Format(PyExc_TypeError,
13829 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013830 "not %.200s",
13831 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013832 return -1;
13833}
13834
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013835static Py_UCS4
13836formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013837{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013838 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013839 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013840 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013841 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013842 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 goto onError;
13844 }
13845 else {
13846 /* Integer input truncated to a character */
13847 long x;
13848 x = PyLong_AsLong(v);
13849 if (x == -1 && PyErr_Occurred())
13850 goto onError;
13851
Victor Stinner8faf8212011-12-08 22:14:11 +010013852 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013853 PyErr_SetString(PyExc_OverflowError,
13854 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013855 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013856 }
13857
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013858 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013859 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013860
Benjamin Peterson29060642009-01-31 22:14:21 +000013861 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013862 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013863 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013864 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013865}
13866
Victor Stinnera47082312012-10-04 02:19:54 +020013867/* Parse options of an argument: flags, width, precision.
13868 Handle also "%(name)" syntax.
13869
13870 Return 0 if the argument has been formatted into arg->str.
13871 Return 1 if the argument has been written into ctx->writer,
13872 Raise an exception and return -1 on error. */
13873static int
13874unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13875 struct unicode_format_arg_t *arg)
13876{
13877#define FORMAT_READ(ctx) \
13878 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13879
13880 PyObject *v;
13881
Victor Stinnera47082312012-10-04 02:19:54 +020013882 if (arg->ch == '(') {
13883 /* Get argument value from a dictionary. Example: "%(name)s". */
13884 Py_ssize_t keystart;
13885 Py_ssize_t keylen;
13886 PyObject *key;
13887 int pcount = 1;
13888
13889 if (ctx->dict == NULL) {
13890 PyErr_SetString(PyExc_TypeError,
13891 "format requires a mapping");
13892 return -1;
13893 }
13894 ++ctx->fmtpos;
13895 --ctx->fmtcnt;
13896 keystart = ctx->fmtpos;
13897 /* Skip over balanced parentheses */
13898 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13899 arg->ch = FORMAT_READ(ctx);
13900 if (arg->ch == ')')
13901 --pcount;
13902 else if (arg->ch == '(')
13903 ++pcount;
13904 ctx->fmtpos++;
13905 }
13906 keylen = ctx->fmtpos - keystart - 1;
13907 if (ctx->fmtcnt < 0 || pcount > 0) {
13908 PyErr_SetString(PyExc_ValueError,
13909 "incomplete format key");
13910 return -1;
13911 }
13912 key = PyUnicode_Substring(ctx->fmtstr,
13913 keystart, keystart + keylen);
13914 if (key == NULL)
13915 return -1;
13916 if (ctx->args_owned) {
13917 Py_DECREF(ctx->args);
13918 ctx->args_owned = 0;
13919 }
13920 ctx->args = PyObject_GetItem(ctx->dict, key);
13921 Py_DECREF(key);
13922 if (ctx->args == NULL)
13923 return -1;
13924 ctx->args_owned = 1;
13925 ctx->arglen = -1;
13926 ctx->argidx = -2;
13927 }
13928
13929 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013930 while (--ctx->fmtcnt >= 0) {
13931 arg->ch = FORMAT_READ(ctx);
13932 ctx->fmtpos++;
13933 switch (arg->ch) {
13934 case '-': arg->flags |= F_LJUST; continue;
13935 case '+': arg->flags |= F_SIGN; continue;
13936 case ' ': arg->flags |= F_BLANK; continue;
13937 case '#': arg->flags |= F_ALT; continue;
13938 case '0': arg->flags |= F_ZERO; continue;
13939 }
13940 break;
13941 }
13942
13943 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013944 if (arg->ch == '*') {
13945 v = unicode_format_getnextarg(ctx);
13946 if (v == NULL)
13947 return -1;
13948 if (!PyLong_Check(v)) {
13949 PyErr_SetString(PyExc_TypeError,
13950 "* wants int");
13951 return -1;
13952 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013953 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013954 if (arg->width == -1 && PyErr_Occurred())
13955 return -1;
13956 if (arg->width < 0) {
13957 arg->flags |= F_LJUST;
13958 arg->width = -arg->width;
13959 }
13960 if (--ctx->fmtcnt >= 0) {
13961 arg->ch = FORMAT_READ(ctx);
13962 ctx->fmtpos++;
13963 }
13964 }
13965 else if (arg->ch >= '0' && arg->ch <= '9') {
13966 arg->width = arg->ch - '0';
13967 while (--ctx->fmtcnt >= 0) {
13968 arg->ch = FORMAT_READ(ctx);
13969 ctx->fmtpos++;
13970 if (arg->ch < '0' || arg->ch > '9')
13971 break;
13972 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13973 mixing signed and unsigned comparison. Since arg->ch is between
13974 '0' and '9', casting to int is safe. */
13975 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13976 PyErr_SetString(PyExc_ValueError,
13977 "width too big");
13978 return -1;
13979 }
13980 arg->width = arg->width*10 + (arg->ch - '0');
13981 }
13982 }
13983
13984 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013985 if (arg->ch == '.') {
13986 arg->prec = 0;
13987 if (--ctx->fmtcnt >= 0) {
13988 arg->ch = FORMAT_READ(ctx);
13989 ctx->fmtpos++;
13990 }
13991 if (arg->ch == '*') {
13992 v = unicode_format_getnextarg(ctx);
13993 if (v == NULL)
13994 return -1;
13995 if (!PyLong_Check(v)) {
13996 PyErr_SetString(PyExc_TypeError,
13997 "* wants int");
13998 return -1;
13999 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014000 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014001 if (arg->prec == -1 && PyErr_Occurred())
14002 return -1;
14003 if (arg->prec < 0)
14004 arg->prec = 0;
14005 if (--ctx->fmtcnt >= 0) {
14006 arg->ch = FORMAT_READ(ctx);
14007 ctx->fmtpos++;
14008 }
14009 }
14010 else if (arg->ch >= '0' && arg->ch <= '9') {
14011 arg->prec = arg->ch - '0';
14012 while (--ctx->fmtcnt >= 0) {
14013 arg->ch = FORMAT_READ(ctx);
14014 ctx->fmtpos++;
14015 if (arg->ch < '0' || arg->ch > '9')
14016 break;
14017 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14018 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014019 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014020 return -1;
14021 }
14022 arg->prec = arg->prec*10 + (arg->ch - '0');
14023 }
14024 }
14025 }
14026
14027 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14028 if (ctx->fmtcnt >= 0) {
14029 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14030 if (--ctx->fmtcnt >= 0) {
14031 arg->ch = FORMAT_READ(ctx);
14032 ctx->fmtpos++;
14033 }
14034 }
14035 }
14036 if (ctx->fmtcnt < 0) {
14037 PyErr_SetString(PyExc_ValueError,
14038 "incomplete format");
14039 return -1;
14040 }
14041 return 0;
14042
14043#undef FORMAT_READ
14044}
14045
14046/* Format one argument. Supported conversion specifiers:
14047
14048 - "s", "r", "a": any type
14049 - "i", "d", "u", "o", "x", "X": int
14050 - "e", "E", "f", "F", "g", "G": float
14051 - "c": int or str (1 character)
14052
Victor Stinner8dbd4212012-12-04 09:30:24 +010014053 When possible, the output is written directly into the Unicode writer
14054 (ctx->writer). A string is created when padding is required.
14055
Victor Stinnera47082312012-10-04 02:19:54 +020014056 Return 0 if the argument has been formatted into *p_str,
14057 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014058 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014059static int
14060unicode_format_arg_format(struct unicode_formatter_t *ctx,
14061 struct unicode_format_arg_t *arg,
14062 PyObject **p_str)
14063{
14064 PyObject *v;
14065 _PyUnicodeWriter *writer = &ctx->writer;
14066
14067 if (ctx->fmtcnt == 0)
14068 ctx->writer.overallocate = 0;
14069
14070 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014071 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014072 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014073 return 1;
14074 }
14075
14076 v = unicode_format_getnextarg(ctx);
14077 if (v == NULL)
14078 return -1;
14079
Victor Stinnera47082312012-10-04 02:19:54 +020014080
14081 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014082 case 's':
14083 case 'r':
14084 case 'a':
14085 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14086 /* Fast path */
14087 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14088 return -1;
14089 return 1;
14090 }
14091
14092 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14093 *p_str = v;
14094 Py_INCREF(*p_str);
14095 }
14096 else {
14097 if (arg->ch == 's')
14098 *p_str = PyObject_Str(v);
14099 else if (arg->ch == 'r')
14100 *p_str = PyObject_Repr(v);
14101 else
14102 *p_str = PyObject_ASCII(v);
14103 }
14104 break;
14105
14106 case 'i':
14107 case 'd':
14108 case 'u':
14109 case 'o':
14110 case 'x':
14111 case 'X':
14112 {
14113 int ret = mainformatlong(v, arg, p_str, writer);
14114 if (ret != 0)
14115 return ret;
14116 arg->sign = 1;
14117 break;
14118 }
14119
14120 case 'e':
14121 case 'E':
14122 case 'f':
14123 case 'F':
14124 case 'g':
14125 case 'G':
14126 if (arg->width == -1 && arg->prec == -1
14127 && !(arg->flags & (F_SIGN | F_BLANK)))
14128 {
14129 /* Fast path */
14130 if (formatfloat(v, arg, NULL, writer) == -1)
14131 return -1;
14132 return 1;
14133 }
14134
14135 arg->sign = 1;
14136 if (formatfloat(v, arg, p_str, NULL) == -1)
14137 return -1;
14138 break;
14139
14140 case 'c':
14141 {
14142 Py_UCS4 ch = formatchar(v);
14143 if (ch == (Py_UCS4) -1)
14144 return -1;
14145 if (arg->width == -1 && arg->prec == -1) {
14146 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014147 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014148 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014149 return 1;
14150 }
14151 *p_str = PyUnicode_FromOrdinal(ch);
14152 break;
14153 }
14154
14155 default:
14156 PyErr_Format(PyExc_ValueError,
14157 "unsupported format character '%c' (0x%x) "
14158 "at index %zd",
14159 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14160 (int)arg->ch,
14161 ctx->fmtpos - 1);
14162 return -1;
14163 }
14164 if (*p_str == NULL)
14165 return -1;
14166 assert (PyUnicode_Check(*p_str));
14167 return 0;
14168}
14169
14170static int
14171unicode_format_arg_output(struct unicode_formatter_t *ctx,
14172 struct unicode_format_arg_t *arg,
14173 PyObject *str)
14174{
14175 Py_ssize_t len;
14176 enum PyUnicode_Kind kind;
14177 void *pbuf;
14178 Py_ssize_t pindex;
14179 Py_UCS4 signchar;
14180 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014181 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014182 Py_ssize_t sublen;
14183 _PyUnicodeWriter *writer = &ctx->writer;
14184 Py_UCS4 fill;
14185
14186 fill = ' ';
14187 if (arg->sign && arg->flags & F_ZERO)
14188 fill = '0';
14189
14190 if (PyUnicode_READY(str) == -1)
14191 return -1;
14192
14193 len = PyUnicode_GET_LENGTH(str);
14194 if ((arg->width == -1 || arg->width <= len)
14195 && (arg->prec == -1 || arg->prec >= len)
14196 && !(arg->flags & (F_SIGN | F_BLANK)))
14197 {
14198 /* Fast path */
14199 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14200 return -1;
14201 return 0;
14202 }
14203
14204 /* Truncate the string for "s", "r" and "a" formats
14205 if the precision is set */
14206 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14207 if (arg->prec >= 0 && len > arg->prec)
14208 len = arg->prec;
14209 }
14210
14211 /* Adjust sign and width */
14212 kind = PyUnicode_KIND(str);
14213 pbuf = PyUnicode_DATA(str);
14214 pindex = 0;
14215 signchar = '\0';
14216 if (arg->sign) {
14217 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14218 if (ch == '-' || ch == '+') {
14219 signchar = ch;
14220 len--;
14221 pindex++;
14222 }
14223 else if (arg->flags & F_SIGN)
14224 signchar = '+';
14225 else if (arg->flags & F_BLANK)
14226 signchar = ' ';
14227 else
14228 arg->sign = 0;
14229 }
14230 if (arg->width < len)
14231 arg->width = len;
14232
14233 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014234 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014235 if (!(arg->flags & F_LJUST)) {
14236 if (arg->sign) {
14237 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014238 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014239 }
14240 else {
14241 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014242 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014243 }
14244 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014245 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14246 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014247 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014248 }
14249
Victor Stinnera47082312012-10-04 02:19:54 +020014250 buflen = arg->width;
14251 if (arg->sign && len == arg->width)
14252 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014253 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014254 return -1;
14255
14256 /* Write the sign if needed */
14257 if (arg->sign) {
14258 if (fill != ' ') {
14259 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14260 writer->pos += 1;
14261 }
14262 if (arg->width > len)
14263 arg->width--;
14264 }
14265
14266 /* Write the numeric prefix for "x", "X" and "o" formats
14267 if the alternate form is used.
14268 For example, write "0x" for the "%#x" format. */
14269 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14270 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14271 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14272 if (fill != ' ') {
14273 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14274 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14275 writer->pos += 2;
14276 pindex += 2;
14277 }
14278 arg->width -= 2;
14279 if (arg->width < 0)
14280 arg->width = 0;
14281 len -= 2;
14282 }
14283
14284 /* Pad left with the fill character if needed */
14285 if (arg->width > len && !(arg->flags & F_LJUST)) {
14286 sublen = arg->width - len;
14287 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14288 writer->pos += sublen;
14289 arg->width = len;
14290 }
14291
14292 /* If padding with spaces: write sign if needed and/or numeric prefix if
14293 the alternate form is used */
14294 if (fill == ' ') {
14295 if (arg->sign) {
14296 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14297 writer->pos += 1;
14298 }
14299 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14300 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14301 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14302 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14303 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14304 writer->pos += 2;
14305 pindex += 2;
14306 }
14307 }
14308
14309 /* Write characters */
14310 if (len) {
14311 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14312 str, pindex, len);
14313 writer->pos += len;
14314 }
14315
14316 /* Pad right with the fill character if needed */
14317 if (arg->width > len) {
14318 sublen = arg->width - len;
14319 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14320 writer->pos += sublen;
14321 }
14322 return 0;
14323}
14324
14325/* Helper of PyUnicode_Format(): format one arg.
14326 Return 0 on success, raise an exception and return -1 on error. */
14327static int
14328unicode_format_arg(struct unicode_formatter_t *ctx)
14329{
14330 struct unicode_format_arg_t arg;
14331 PyObject *str;
14332 int ret;
14333
Victor Stinner8dbd4212012-12-04 09:30:24 +010014334 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14335 arg.flags = 0;
14336 arg.width = -1;
14337 arg.prec = -1;
14338 arg.sign = 0;
14339 str = NULL;
14340
Victor Stinnera47082312012-10-04 02:19:54 +020014341 ret = unicode_format_arg_parse(ctx, &arg);
14342 if (ret == -1)
14343 return -1;
14344
14345 ret = unicode_format_arg_format(ctx, &arg, &str);
14346 if (ret == -1)
14347 return -1;
14348
14349 if (ret != 1) {
14350 ret = unicode_format_arg_output(ctx, &arg, str);
14351 Py_DECREF(str);
14352 if (ret == -1)
14353 return -1;
14354 }
14355
14356 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14357 PyErr_SetString(PyExc_TypeError,
14358 "not all arguments converted during string formatting");
14359 return -1;
14360 }
14361 return 0;
14362}
14363
Alexander Belopolsky40018472011-02-26 01:02:56 +000014364PyObject *
14365PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014366{
Victor Stinnera47082312012-10-04 02:19:54 +020014367 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014368
Guido van Rossumd57fd912000-03-10 22:53:23 +000014369 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014370 PyErr_BadInternalCall();
14371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014372 }
Victor Stinnera47082312012-10-04 02:19:54 +020014373
14374 ctx.fmtstr = PyUnicode_FromObject(format);
14375 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014376 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014377 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14378 Py_DECREF(ctx.fmtstr);
14379 return NULL;
14380 }
14381 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14382 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14383 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14384 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014385
Victor Stinner8f674cc2013-04-17 23:02:17 +020014386 _PyUnicodeWriter_Init(&ctx.writer);
14387 ctx.writer.min_length = ctx.fmtcnt + 100;
14388 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014389
Guido van Rossumd57fd912000-03-10 22:53:23 +000014390 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014391 ctx.arglen = PyTuple_Size(args);
14392 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014393 }
14394 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014395 ctx.arglen = -1;
14396 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014397 }
Victor Stinnera47082312012-10-04 02:19:54 +020014398 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014399 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014400 ctx.dict = args;
14401 else
14402 ctx.dict = NULL;
14403 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014404
Victor Stinnera47082312012-10-04 02:19:54 +020014405 while (--ctx.fmtcnt >= 0) {
14406 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014407 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014408
14409 nonfmtpos = ctx.fmtpos++;
14410 while (ctx.fmtcnt >= 0 &&
14411 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14412 ctx.fmtpos++;
14413 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014414 }
Victor Stinnera47082312012-10-04 02:19:54 +020014415 if (ctx.fmtcnt < 0) {
14416 ctx.fmtpos--;
14417 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014418 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014419
Victor Stinnercfc4c132013-04-03 01:48:39 +020014420 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14421 nonfmtpos, ctx.fmtpos) < 0)
14422 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014423 }
14424 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014425 ctx.fmtpos++;
14426 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014427 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014428 }
14429 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014430
Victor Stinnera47082312012-10-04 02:19:54 +020014431 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014432 PyErr_SetString(PyExc_TypeError,
14433 "not all arguments converted during string formatting");
14434 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014435 }
14436
Victor Stinnera47082312012-10-04 02:19:54 +020014437 if (ctx.args_owned) {
14438 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014439 }
Victor Stinnera47082312012-10-04 02:19:54 +020014440 Py_DECREF(ctx.fmtstr);
14441 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014442
Benjamin Peterson29060642009-01-31 22:14:21 +000014443 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014444 Py_DECREF(ctx.fmtstr);
14445 _PyUnicodeWriter_Dealloc(&ctx.writer);
14446 if (ctx.args_owned) {
14447 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014448 }
14449 return NULL;
14450}
14451
Jeremy Hylton938ace62002-07-17 16:30:39 +000014452static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014453unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14454
Tim Peters6d6c1a32001-08-02 04:15:00 +000014455static PyObject *
14456unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14457{
Benjamin Peterson29060642009-01-31 22:14:21 +000014458 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014459 static char *kwlist[] = {"object", "encoding", "errors", 0};
14460 char *encoding = NULL;
14461 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014462
Benjamin Peterson14339b62009-01-31 16:36:08 +000014463 if (type != &PyUnicode_Type)
14464 return unicode_subtype_new(type, args, kwds);
14465 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014466 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014467 return NULL;
14468 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014469 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014470 if (encoding == NULL && errors == NULL)
14471 return PyObject_Str(x);
14472 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014473 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014474}
14475
Guido van Rossume023fe02001-08-30 03:12:59 +000014476static PyObject *
14477unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14478{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014479 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014480 Py_ssize_t length, char_size;
14481 int share_wstr, share_utf8;
14482 unsigned int kind;
14483 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014484
Benjamin Peterson14339b62009-01-31 16:36:08 +000014485 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014486
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014487 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014488 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014489 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014490 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014491 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014492 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014493 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014494 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014495
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014496 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014497 if (self == NULL) {
14498 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014499 return NULL;
14500 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014501 kind = PyUnicode_KIND(unicode);
14502 length = PyUnicode_GET_LENGTH(unicode);
14503
14504 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014505#ifdef Py_DEBUG
14506 _PyUnicode_HASH(self) = -1;
14507#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014508 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014509#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014510 _PyUnicode_STATE(self).interned = 0;
14511 _PyUnicode_STATE(self).kind = kind;
14512 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014513 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014514 _PyUnicode_STATE(self).ready = 1;
14515 _PyUnicode_WSTR(self) = NULL;
14516 _PyUnicode_UTF8_LENGTH(self) = 0;
14517 _PyUnicode_UTF8(self) = NULL;
14518 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014519 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014520
14521 share_utf8 = 0;
14522 share_wstr = 0;
14523 if (kind == PyUnicode_1BYTE_KIND) {
14524 char_size = 1;
14525 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14526 share_utf8 = 1;
14527 }
14528 else if (kind == PyUnicode_2BYTE_KIND) {
14529 char_size = 2;
14530 if (sizeof(wchar_t) == 2)
14531 share_wstr = 1;
14532 }
14533 else {
14534 assert(kind == PyUnicode_4BYTE_KIND);
14535 char_size = 4;
14536 if (sizeof(wchar_t) == 4)
14537 share_wstr = 1;
14538 }
14539
14540 /* Ensure we won't overflow the length. */
14541 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14542 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014543 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014544 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014545 data = PyObject_MALLOC((length + 1) * char_size);
14546 if (data == NULL) {
14547 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014548 goto onError;
14549 }
14550
Victor Stinnerc3c74152011-10-02 20:39:55 +020014551 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014552 if (share_utf8) {
14553 _PyUnicode_UTF8_LENGTH(self) = length;
14554 _PyUnicode_UTF8(self) = data;
14555 }
14556 if (share_wstr) {
14557 _PyUnicode_WSTR_LENGTH(self) = length;
14558 _PyUnicode_WSTR(self) = (wchar_t *)data;
14559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014560
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014561 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014562 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014563 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014564#ifdef Py_DEBUG
14565 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14566#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014567 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014568 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014569
14570onError:
14571 Py_DECREF(unicode);
14572 Py_DECREF(self);
14573 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014574}
14575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014576PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014577"str(object='') -> str\n\
14578str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014579\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014580Create a new string object from the given object. If encoding or\n\
14581errors is specified, then the object must expose a data buffer\n\
14582that will be decoded using the given encoding and error handler.\n\
14583Otherwise, returns the result of object.__str__() (if defined)\n\
14584or repr(object).\n\
14585encoding defaults to sys.getdefaultencoding().\n\
14586errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014587
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014588static PyObject *unicode_iter(PyObject *seq);
14589
Guido van Rossumd57fd912000-03-10 22:53:23 +000014590PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014591 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014592 "str", /* tp_name */
14593 sizeof(PyUnicodeObject), /* tp_size */
14594 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014595 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014596 (destructor)unicode_dealloc, /* tp_dealloc */
14597 0, /* tp_print */
14598 0, /* tp_getattr */
14599 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014600 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014601 unicode_repr, /* tp_repr */
14602 &unicode_as_number, /* tp_as_number */
14603 &unicode_as_sequence, /* tp_as_sequence */
14604 &unicode_as_mapping, /* tp_as_mapping */
14605 (hashfunc) unicode_hash, /* tp_hash*/
14606 0, /* tp_call*/
14607 (reprfunc) unicode_str, /* tp_str */
14608 PyObject_GenericGetAttr, /* tp_getattro */
14609 0, /* tp_setattro */
14610 0, /* tp_as_buffer */
14611 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014612 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014613 unicode_doc, /* tp_doc */
14614 0, /* tp_traverse */
14615 0, /* tp_clear */
14616 PyUnicode_RichCompare, /* tp_richcompare */
14617 0, /* tp_weaklistoffset */
14618 unicode_iter, /* tp_iter */
14619 0, /* tp_iternext */
14620 unicode_methods, /* tp_methods */
14621 0, /* tp_members */
14622 0, /* tp_getset */
14623 &PyBaseObject_Type, /* tp_base */
14624 0, /* tp_dict */
14625 0, /* tp_descr_get */
14626 0, /* tp_descr_set */
14627 0, /* tp_dictoffset */
14628 0, /* tp_init */
14629 0, /* tp_alloc */
14630 unicode_new, /* tp_new */
14631 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014632};
14633
14634/* Initialize the Unicode implementation */
14635
Victor Stinner3a50e702011-10-18 21:21:00 +020014636int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014637{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014638 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014639 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014640 0x000A, /* LINE FEED */
14641 0x000D, /* CARRIAGE RETURN */
14642 0x001C, /* FILE SEPARATOR */
14643 0x001D, /* GROUP SEPARATOR */
14644 0x001E, /* RECORD SEPARATOR */
14645 0x0085, /* NEXT LINE */
14646 0x2028, /* LINE SEPARATOR */
14647 0x2029, /* PARAGRAPH SEPARATOR */
14648 };
14649
Fred Drakee4315f52000-05-09 19:53:39 +000014650 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014651 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014652 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014653 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014654 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014655
Guido van Rossumcacfc072002-05-24 19:01:59 +000014656 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014657 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014658
14659 /* initialize the linebreak bloom filter */
14660 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014661 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014662 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014663
Christian Heimes26532f72013-07-20 14:57:16 +020014664 if (PyType_Ready(&EncodingMapType) < 0)
14665 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014666
Benjamin Petersonc4311282012-10-30 23:21:10 -040014667 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14668 Py_FatalError("Can't initialize field name iterator type");
14669
14670 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14671 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014672
Victor Stinner3a50e702011-10-18 21:21:00 +020014673#ifdef HAVE_MBCS
14674 winver.dwOSVersionInfoSize = sizeof(winver);
14675 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14676 PyErr_SetFromWindowsErr(0);
14677 return -1;
14678 }
14679#endif
14680 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014681}
14682
14683/* Finalize the Unicode implementation */
14684
/* Nothing is released here; the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14690
Guido van Rossumd57fd912000-03-10 22:53:23 +000014691void
Thomas Wouters78890102000-07-22 19:25:51 +000014692_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014693{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014694 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014695
Serhiy Storchaka05997252013-01-26 12:14:02 +020014696 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014697
Serhiy Storchaka05997252013-01-26 12:14:02 +020014698 for (i = 0; i < 256; i++)
14699 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014700 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014701 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014702}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014703
Walter Dörwald16807132007-05-25 13:52:07 +000014704void
14705PyUnicode_InternInPlace(PyObject **p)
14706{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014707 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014708 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014709#ifdef Py_DEBUG
14710 assert(s != NULL);
14711 assert(_PyUnicode_CHECK(s));
14712#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014713 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014714 return;
14715#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014716 /* If it's a subclass, we don't really know what putting
14717 it in the interned dict might do. */
14718 if (!PyUnicode_CheckExact(s))
14719 return;
14720 if (PyUnicode_CHECK_INTERNED(s))
14721 return;
14722 if (interned == NULL) {
14723 interned = PyDict_New();
14724 if (interned == NULL) {
14725 PyErr_Clear(); /* Don't leave an exception */
14726 return;
14727 }
14728 }
14729 /* It might be that the GetItem call fails even
14730 though the key is present in the dictionary,
14731 namely when this happens during a stack overflow. */
14732 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014733 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014734 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014735
Victor Stinnerf0335102013-04-14 19:13:03 +020014736 if (t) {
14737 Py_INCREF(t);
14738 Py_DECREF(*p);
14739 *p = t;
14740 return;
14741 }
Walter Dörwald16807132007-05-25 13:52:07 +000014742
Benjamin Peterson14339b62009-01-31 16:36:08 +000014743 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014744 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014745 PyErr_Clear();
14746 PyThreadState_GET()->recursion_critical = 0;
14747 return;
14748 }
14749 PyThreadState_GET()->recursion_critical = 0;
14750 /* The two references in interned are not counted by refcnt.
14751 The deallocator will take care of this */
14752 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014753 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014754}
14755
14756void
14757PyUnicode_InternImmortal(PyObject **p)
14758{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014759 PyUnicode_InternInPlace(p);
14760 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014761 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014762 Py_INCREF(*p);
14763 }
Walter Dörwald16807132007-05-25 13:52:07 +000014764}
14765
14766PyObject *
14767PyUnicode_InternFromString(const char *cp)
14768{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014769 PyObject *s = PyUnicode_FromString(cp);
14770 if (s == NULL)
14771 return NULL;
14772 PyUnicode_InternInPlace(&s);
14773 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014774}
14775
Alexander Belopolsky40018472011-02-26 01:02:56 +000014776void
14777_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014778{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014779 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014780 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014781 Py_ssize_t i, n;
14782 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014783
Benjamin Peterson14339b62009-01-31 16:36:08 +000014784 if (interned == NULL || !PyDict_Check(interned))
14785 return;
14786 keys = PyDict_Keys(interned);
14787 if (keys == NULL || !PyList_Check(keys)) {
14788 PyErr_Clear();
14789 return;
14790 }
Walter Dörwald16807132007-05-25 13:52:07 +000014791
Benjamin Peterson14339b62009-01-31 16:36:08 +000014792 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14793 detector, interned unicode strings are not forcibly deallocated;
14794 rather, we give them their stolen references back, and then clear
14795 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014796
Benjamin Peterson14339b62009-01-31 16:36:08 +000014797 n = PyList_GET_SIZE(keys);
14798 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014799 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014800 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014801 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014802 if (PyUnicode_READY(s) == -1) {
14803 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014804 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014806 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014807 case SSTATE_NOT_INTERNED:
14808 /* XXX Shouldn't happen */
14809 break;
14810 case SSTATE_INTERNED_IMMORTAL:
14811 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014812 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014813 break;
14814 case SSTATE_INTERNED_MORTAL:
14815 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014816 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014817 break;
14818 default:
14819 Py_FatalError("Inconsistent interned string state.");
14820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014821 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014822 }
14823 fprintf(stderr, "total size of all interned strings: "
14824 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14825 "mortal/immortal\n", mortal_size, immortal_size);
14826 Py_DECREF(keys);
14827 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014828 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014829}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014830
14831
14832/********************* Unicode Iterator **************************/
14833
14834typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014835 PyObject_HEAD
14836 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014837 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014838} unicodeiterobject;
14839
14840static void
14841unicodeiter_dealloc(unicodeiterobject *it)
14842{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014843 _PyObject_GC_UNTRACK(it);
14844 Py_XDECREF(it->it_seq);
14845 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014846}
14847
14848static int
14849unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14850{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014851 Py_VISIT(it->it_seq);
14852 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014853}
14854
14855static PyObject *
14856unicodeiter_next(unicodeiterobject *it)
14857{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014858 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014859
Benjamin Peterson14339b62009-01-31 16:36:08 +000014860 assert(it != NULL);
14861 seq = it->it_seq;
14862 if (seq == NULL)
14863 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014864 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014866 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14867 int kind = PyUnicode_KIND(seq);
14868 void *data = PyUnicode_DATA(seq);
14869 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14870 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014871 if (item != NULL)
14872 ++it->it_index;
14873 return item;
14874 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014875
Benjamin Peterson14339b62009-01-31 16:36:08 +000014876 Py_DECREF(seq);
14877 it->it_seq = NULL;
14878 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014879}
14880
14881static PyObject *
14882unicodeiter_len(unicodeiterobject *it)
14883{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014884 Py_ssize_t len = 0;
14885 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014886 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014887 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014888}
14889
14890PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14891
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014892static PyObject *
14893unicodeiter_reduce(unicodeiterobject *it)
14894{
14895 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014896 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014897 it->it_seq, it->it_index);
14898 } else {
14899 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14900 if (u == NULL)
14901 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014902 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014903 }
14904}
14905
14906PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14907
14908static PyObject *
14909unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14910{
14911 Py_ssize_t index = PyLong_AsSsize_t(state);
14912 if (index == -1 && PyErr_Occurred())
14913 return NULL;
14914 if (index < 0)
14915 index = 0;
14916 it->it_index = index;
14917 Py_RETURN_NONE;
14918}
14919
14920PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14921
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014922static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014923 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014924 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014925 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14926 reduce_doc},
14927 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14928 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014929 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014930};
14931
14932PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014933 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14934 "str_iterator", /* tp_name */
14935 sizeof(unicodeiterobject), /* tp_basicsize */
14936 0, /* tp_itemsize */
14937 /* methods */
14938 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14939 0, /* tp_print */
14940 0, /* tp_getattr */
14941 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014942 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014943 0, /* tp_repr */
14944 0, /* tp_as_number */
14945 0, /* tp_as_sequence */
14946 0, /* tp_as_mapping */
14947 0, /* tp_hash */
14948 0, /* tp_call */
14949 0, /* tp_str */
14950 PyObject_GenericGetAttr, /* tp_getattro */
14951 0, /* tp_setattro */
14952 0, /* tp_as_buffer */
14953 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14954 0, /* tp_doc */
14955 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14956 0, /* tp_clear */
14957 0, /* tp_richcompare */
14958 0, /* tp_weaklistoffset */
14959 PyObject_SelfIter, /* tp_iter */
14960 (iternextfunc)unicodeiter_next, /* tp_iternext */
14961 unicodeiter_methods, /* tp_methods */
14962 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014963};
14964
14965static PyObject *
14966unicode_iter(PyObject *seq)
14967{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014968 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014969
Benjamin Peterson14339b62009-01-31 16:36:08 +000014970 if (!PyUnicode_Check(seq)) {
14971 PyErr_BadInternalCall();
14972 return NULL;
14973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014974 if (PyUnicode_READY(seq) == -1)
14975 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014976 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14977 if (it == NULL)
14978 return NULL;
14979 it->it_index = 0;
14980 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014981 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014982 _PyObject_GC_TRACK(it);
14983 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014984}
14985
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014986
14987size_t
14988Py_UNICODE_strlen(const Py_UNICODE *u)
14989{
14990 int res = 0;
14991 while(*u++)
14992 res++;
14993 return res;
14994}
14995
14996Py_UNICODE*
14997Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14998{
14999 Py_UNICODE *u = s1;
15000 while ((*u++ = *s2++));
15001 return s1;
15002}
15003
15004Py_UNICODE*
15005Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15006{
15007 Py_UNICODE *u = s1;
15008 while ((*u++ = *s2++))
15009 if (n-- == 0)
15010 break;
15011 return s1;
15012}
15013
15014Py_UNICODE*
15015Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15016{
15017 Py_UNICODE *u1 = s1;
15018 u1 += Py_UNICODE_strlen(u1);
15019 Py_UNICODE_strcpy(u1, s2);
15020 return s1;
15021}
15022
15023int
15024Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15025{
15026 while (*s1 && *s2 && *s1 == *s2)
15027 s1++, s2++;
15028 if (*s1 && *s2)
15029 return (*s1 < *s2) ? -1 : +1;
15030 if (*s1)
15031 return 1;
15032 if (*s2)
15033 return -1;
15034 return 0;
15035}
15036
15037int
15038Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15039{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015040 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015041 for (; n != 0; n--) {
15042 u1 = *s1;
15043 u2 = *s2;
15044 if (u1 != u2)
15045 return (u1 < u2) ? -1 : +1;
15046 if (u1 == '\0')
15047 return 0;
15048 s1++;
15049 s2++;
15050 }
15051 return 0;
15052}
15053
15054Py_UNICODE*
15055Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15056{
15057 const Py_UNICODE *p;
15058 for (p = s; *p; p++)
15059 if (*p == c)
15060 return (Py_UNICODE*)p;
15061 return NULL;
15062}
15063
15064Py_UNICODE*
15065Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15066{
15067 const Py_UNICODE *p;
15068 p = s + Py_UNICODE_strlen(s);
15069 while (p != s) {
15070 p--;
15071 if (*p == c)
15072 return (Py_UNICODE*)p;
15073 }
15074 return NULL;
15075}
Victor Stinner331ea922010-08-10 16:37:20 +000015076
Victor Stinner71133ff2010-09-01 23:43:53 +000015077Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015078PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015079{
Victor Stinner577db2c2011-10-11 22:12:48 +020015080 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015081 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015083 if (!PyUnicode_Check(unicode)) {
15084 PyErr_BadArgument();
15085 return NULL;
15086 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015087 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015088 if (u == NULL)
15089 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015090 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015091 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015092 PyErr_NoMemory();
15093 return NULL;
15094 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015095 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015096 size *= sizeof(Py_UNICODE);
15097 copy = PyMem_Malloc(size);
15098 if (copy == NULL) {
15099 PyErr_NoMemory();
15100 return NULL;
15101 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015102 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015103 return copy;
15104}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015105
Georg Brandl66c221e2010-10-14 07:04:07 +000015106/* A _string module, to export formatter_parser and formatter_field_name_split
15107 to the string.Formatter class implemented in Python. */
15108
15109static PyMethodDef _string_methods[] = {
15110 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15111 METH_O, PyDoc_STR("split the argument as a field name")},
15112 {"formatter_parser", (PyCFunction) formatter_parser,
15113 METH_O, PyDoc_STR("parse the argument as a format string")},
15114 {NULL, NULL}
15115};
15116
15117static struct PyModuleDef _string_module = {
15118 PyModuleDef_HEAD_INIT,
15119 "_string",
15120 PyDoc_STR("string helper module"),
15121 0,
15122 _string_methods,
15123 NULL,
15124 NULL,
15125 NULL,
15126 NULL
15127};
15128
15129PyMODINIT_FUNC
15130PyInit__string(void)
15131{
15132 return PyModule_Create(&_string_module);
15133}
15134
15135
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015136#ifdef __cplusplus
15137}
15138#endif