blob: 1375ef3093d7cbc6d19cdc34c22f8c65b4fdfad5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
Serhiy Storchaka05997252013-01-26 12:14:02 +020052NOTE: In the interpreter's initialization phase, some globals are currently
53 initialized dynamically as needed. In the process Unicode objects may
54 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055
56*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the assertions of the macros below: the full
   consistency checker (without content check) in debug builds, a plain
   type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer: for a compact ASCII string this is the memory
   located right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length: for a compact ASCII string this is the length
   field of the PyASCIIObject header, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them are usable as lvalues. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Same as reading state.kind / length directly, but asserting first that
   op passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement of PyUnicode_READY: evaluates to 0 when op is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200113
/* True if the utf8 pointer of op is the canonical data buffer itself,
   i.e. the UTF-8 representation is shared with the string data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the canonical data buffer itself,
   i.e. the wchar_t representation is shared with the string data.
   The macro refers only to its own argument; it previously expanded
   _PyUnicode_WSTR(unicode) and therefore silently depended on the caller
   having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
121
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: it is not compact ASCII, its utf8 pointer is set, and that
   pointer is not the canonical data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separately allocated wstr memory
   block: its wstr pointer is set and either the string is not ready or
   wstr is not the canonical data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
135
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy is done four elements at a time for the largest multiple of
   four that fits, followed by an element-wise loop for the remainder.

   Every argument is parenthesized before use so that expression
   arguments such as "buf + offset" are cast and evaluated as a whole,
   and all helper locals carry a leading underscore to keep them out of
   the caller's namespace. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200159
Walter Dörwald16807132007-05-25 13:52:07 +0000160/* This dictionary holds all interned unicode strings. Note that references
161 to strings in this dictionary are *not* counted in the string's ob_refcnt.
162 When the interned string reaches a refcnt of 0 the string deallocation
163 function will delete the reference from this dictionary.
164
165 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000166 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000167*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200168static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000169
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000170/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200171static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200172
Serhiy Storchaka678db842013-01-26 12:16:36 +0200173#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200174 do { \
175 if (unicode_empty != NULL) \
176 Py_INCREF(unicode_empty); \
177 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178 unicode_empty = PyUnicode_New(0, 0); \
179 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200180 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200181 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
182 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200183 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200184 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186#define _Py_RETURN_UNICODE_EMPTY() \
187 do { \
188 _Py_INCREF_UNICODE_EMPTY(); \
189 return unicode_empty; \
190 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200192/* Forward declaration */
193Py_LOCAL_INLINE(int)
194_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
195
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200196/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200197static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* Single character Unicode strings in the Latin-1 range are being
200 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, holding 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
233
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200234/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200235static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200236static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100237static int unicode_modifiable(PyObject *unicode);
238
Victor Stinnerfe226c02011-10-03 03:52:20 +0200239
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100241_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200242static PyObject *
243_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
244static PyObject *
245_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
246
247static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000248unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000249 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100250 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000251 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
252
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253static void
254raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300255 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100256 PyObject *unicode,
257 Py_ssize_t startpos, Py_ssize_t endpos,
258 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000259
/* Same for linebreaks: a 128-entry table indexed by ASCII code, holding 1
   for line break characters and 0 otherwise.  The table is only ever read,
   so it is declared const to keep it out of writable data and to let the
   compiler reject accidental writes. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
287
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300288/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
289 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000291PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000293#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000294 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000296 /* This is actually an illegal character, so it should
297 not be passed to unichr. */
298 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299#endif
300}
301
#ifdef Py_DEBUG
/* Debug-build checker for the internal invariants of a Unicode object.

   op must be a Unicode object.  Every invariant is verified through
   assert(), so a violation aborts the process; when all checks pass the
   function returns 1 (which lets callers wrap the call in assert()).

   If check_content is non-zero and the string is not of the legacy wchar
   kind, the characters are also scanned to verify that the narrowest
   possible kind is used and that the data is followed by a null
   character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    /* Kind whose character width equals sizeof(wchar_t); only for that
       kind is wstr expected to be the data buffer itself. */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wstr_shared_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wstr_shared_kind = PyUnicode_4BYTE_KIND;
#endif
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: data follows the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
            else {
                /* legacy string that is not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_shared_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 current = PyUnicode_READ(kind, data, pos);
            if (current > maxchar)
                maxchar = current;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the data must be followed by a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
417
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100418static PyObject*
419unicode_result_wchar(PyObject *unicode)
420{
421#ifndef Py_DEBUG
422 Py_ssize_t len;
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100426 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200427 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100432 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200440 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 return NULL;
442 }
443#else
Victor Stinneraa771272012-10-04 02:32:58 +0200444 assert(Py_REFCNT(unicode) == 1);
445
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 /* don't make the result ready in debug mode to ensure that the caller
447 makes the string ready before using it */
448 assert(_PyUnicode_CheckConsistency(unicode, 1));
449#endif
450 return unicode;
451}
452
453static PyObject*
454unicode_result_ready(PyObject *unicode)
455{
456 Py_ssize_t length;
457
458 length = PyUnicode_GET_LENGTH(unicode);
459 if (length == 0) {
460 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100461 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200462 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100463 }
464 return unicode_empty;
465 }
466
467 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200468 void *data = PyUnicode_DATA(unicode);
469 int kind = PyUnicode_KIND(unicode);
470 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100471 if (ch < 256) {
472 PyObject *latin1_char = unicode_latin1[ch];
473 if (latin1_char != NULL) {
474 if (unicode != latin1_char) {
475 Py_INCREF(latin1_char);
476 Py_DECREF(unicode);
477 }
478 return latin1_char;
479 }
480 else {
481 assert(_PyUnicode_CheckConsistency(unicode, 1));
482 Py_INCREF(unicode);
483 unicode_latin1[ch] = unicode;
484 return unicode;
485 }
486 }
487 }
488
489 assert(_PyUnicode_CheckConsistency(unicode, 1));
490 return unicode;
491}
492
493static PyObject*
494unicode_result(PyObject *unicode)
495{
496 assert(_PyUnicode_CHECK(unicode));
497 if (PyUnicode_IS_READY(unicode))
498 return unicode_result_ready(unicode);
499 else
500 return unicode_result_wchar(unicode);
501}
502
Victor Stinnerc4b49542011-12-11 22:44:26 +0100503static PyObject*
504unicode_result_unchanged(PyObject *unicode)
505{
506 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500507 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508 return NULL;
509 Py_INCREF(unicode);
510 return unicode;
511 }
512 else
513 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100514 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515}
516
#if defined(HAVE_MBCS)
/* Windows version information, only present when HAVE_MBCS is defined.
   NOTE(review): where this is filled in and read is not visible from
   here -- confirm against the MBCS codec code. */
static OSVERSIONINFOEX winver;
#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521/* --- Bloom Filters ----------------------------------------------------- */
522
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
526
527/* the linebreak mask is set up by Unicode_Init below */
528
Antoine Pitrouf068f942010-01-13 14:19:12 +0000529#if LONG_BIT >= 128
530#define BLOOM_WIDTH 128
531#elif LONG_BIT >= 64
532#define BLOOM_WIDTH 64
533#elif LONG_BIT >= 32
534#define BLOOM_WIDTH 32
535#else
536#error "LONG_BIT is smaller than 32"
537#endif
538
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539#define BLOOM_MASK unsigned long
540
Serhiy Storchaka05997252013-01-26 12:14:02 +0200541static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542
Antoine Pitrouf068f942010-01-13 14:19:12 +0000543#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544
Benjamin Peterson29060642009-01-31 22:14:21 +0000545#define BLOOM_LINEBREAK(ch) \
546 ((ch) < 128U ? ascii_linebreak[(ch)] : \
547 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
Alexander Belopolsky40018472011-02-26 01:02:56 +0000549Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551{
Victor Stinnera85af502013-04-09 21:53:54 +0200552#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
553 do { \
554 TYPE *data = (TYPE *)PTR; \
555 TYPE *end = data + LEN; \
556 Py_UCS4 ch; \
557 for (; data != end; data++) { \
558 ch = *data; \
559 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
560 } \
561 break; \
562 } while (0)
563
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564 /* calculate simple bloom-style bitmask for a given unicode string */
565
Antoine Pitrouf068f942010-01-13 14:19:12 +0000566 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567
568 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200569 switch (kind) {
570 case PyUnicode_1BYTE_KIND:
571 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
572 break;
573 case PyUnicode_2BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
575 break;
576 case PyUnicode_4BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
578 break;
579 default:
580 assert(0);
581 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000582 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200583
584#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585}
586
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200587/* Compilation of templated routines */
588
589#include "stringlib/asciilib.h"
590#include "stringlib/fastsearch.h"
591#include "stringlib/partition.h"
592#include "stringlib/split.h"
593#include "stringlib/count.h"
594#include "stringlib/find.h"
595#include "stringlib/find_max_char.h"
596#include "stringlib/localeutil.h"
597#include "stringlib/undef.h"
598
599#include "stringlib/ucs1lib.h"
600#include "stringlib/fastsearch.h"
601#include "stringlib/partition.h"
602#include "stringlib/split.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300605#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs2lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300616#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200617#include "stringlib/find_max_char.h"
618#include "stringlib/localeutil.h"
619#include "stringlib/undef.h"
620
621#include "stringlib/ucs4lib.h"
622#include "stringlib/fastsearch.h"
623#include "stringlib/partition.h"
624#include "stringlib/split.h"
625#include "stringlib/count.h"
626#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300627#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200628#include "stringlib/find_max_char.h"
629#include "stringlib/localeutil.h"
630#include "stringlib/undef.h"
631
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200632#include "stringlib/unicodedefs.h"
633#include "stringlib/fastsearch.h"
634#include "stringlib/count.h"
635#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100636#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638/* --- Unicode Object ----------------------------------------------------- */
639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200641fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200643Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
644 Py_ssize_t size, Py_UCS4 ch,
645 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
648
649 switch (kind) {
650 case PyUnicode_1BYTE_KIND:
651 {
652 Py_UCS1 ch1 = (Py_UCS1) ch;
653 if (ch1 == ch)
654 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
655 else
656 return -1;
657 }
658 case PyUnicode_2BYTE_KIND:
659 {
660 Py_UCS2 ch2 = (Py_UCS2) ch;
661 if (ch2 == ch)
662 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
663 else
664 return -1;
665 }
666 case PyUnicode_4BYTE_KIND:
667 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
668 default:
669 assert(0);
670 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672}
673
#ifdef Py_DEBUG
/* Fill the data of an Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten (every byte is set to 0xff); nothing happens if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        /* kind is the character width in bytes, so it scales both the
           start offset and the byte count */
        Py_UCS1 *tail = data + old_length * kind;
        memset(tail, 0xff, (length - old_length) * kind);
    }
}
#endif
692
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693static PyObject*
694resize_compact(PyObject *unicode, Py_ssize_t length)
695{
696 Py_ssize_t char_size;
697 Py_ssize_t struct_size;
698 Py_ssize_t new_size;
699 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100700 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
703#endif
704
Victor Stinner79891572012-05-03 13:43:07 +0200705 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100707 assert(PyUnicode_IS_COMPACT(unicode));
708
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200709 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100710 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 struct_size = sizeof(PyASCIIObject);
712 else
713 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
717 PyErr_NoMemory();
718 return NULL;
719 }
720 new_size = (struct_size + (length + 1) * char_size);
721
Victor Stinner84def372011-12-11 20:04:56 +0100722 _Py_DEC_REFTOTAL;
723 _Py_ForgetReference(unicode);
724
725 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
726 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100727 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200728 PyErr_NoMemory();
729 return NULL;
730 }
Victor Stinner84def372011-12-11 20:04:56 +0100731 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100733
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100737 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 _PyUnicode_WSTR_LENGTH(unicode) = length;
739 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100740 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
741 PyObject_DEL(_PyUnicode_WSTR(unicode));
742 _PyUnicode_WSTR(unicode) = NULL;
743 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200744#ifdef Py_DEBUG
745 unicode_fill_invalid(unicode, old_length);
746#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
748 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 return unicode;
751}
752
Alexander Belopolsky40018472011-02-26 01:02:56 +0000753static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200754resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755{
Victor Stinner95663112011-10-04 01:03:50 +0200756 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000760
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 if (PyUnicode_IS_READY(unicode)) {
762 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200765#ifdef Py_DEBUG
766 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
767#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200770 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200771 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
772 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
775 PyErr_NoMemory();
776 return -1;
777 }
778 new_size = (length + 1) * char_size;
779
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
781 {
782 PyObject_DEL(_PyUnicode_UTF8(unicode));
783 _PyUnicode_UTF8(unicode) = NULL;
784 _PyUnicode_UTF8_LENGTH(unicode) = 0;
785 }
786
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 data = (PyObject *)PyObject_REALLOC(data, new_size);
788 if (data == NULL) {
789 PyErr_NoMemory();
790 return -1;
791 }
792 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200793 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200795 _PyUnicode_WSTR_LENGTH(unicode) = length;
796 }
797 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200798 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200799 _PyUnicode_UTF8_LENGTH(unicode) = length;
800 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200801 _PyUnicode_LENGTH(unicode) = length;
802 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200803#ifdef Py_DEBUG
804 unicode_fill_invalid(unicode, old_length);
805#endif
Victor Stinner95663112011-10-04 01:03:50 +0200806 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200807 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200809 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200810 }
Victor Stinner95663112011-10-04 01:03:50 +0200811 assert(_PyUnicode_WSTR(unicode) != NULL);
812
813 /* check for integer overflow */
814 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
815 PyErr_NoMemory();
816 return -1;
817 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100818 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200819 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100820 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200821 if (!wstr) {
822 PyErr_NoMemory();
823 return -1;
824 }
825 _PyUnicode_WSTR(unicode) = wstr;
826 _PyUnicode_WSTR(unicode)[length] = 0;
827 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200828 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 return 0;
830}
831
Victor Stinnerfe226c02011-10-03 03:52:20 +0200832static PyObject*
833resize_copy(PyObject *unicode, Py_ssize_t length)
834{
835 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100836 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100838
Benjamin Petersonbac79492012-01-14 13:34:47 -0500839 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100840 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200841
842 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
843 if (copy == NULL)
844 return NULL;
845
846 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200847 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200848 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200849 }
850 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200851 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100852
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200853 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200854 if (w == NULL)
855 return NULL;
856 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
857 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200858 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
859 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200860 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200861 }
862}
863
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000865 Ux0000 terminated; some code (e.g. new_identifier)
866 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867
868 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000869 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870
871*/
872
Alexander Belopolsky40018472011-02-26 01:02:56 +0000873static PyUnicodeObject *
874_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200876 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880 if (length == 0 && unicode_empty != NULL) {
881 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200882 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 }
884
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000885 /* Ensure we won't overflow the size. */
886 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
887 return (PyUnicodeObject *)PyErr_NoMemory();
888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 if (length < 0) {
890 PyErr_SetString(PyExc_SystemError,
891 "Negative size passed to _PyUnicode_New");
892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
896 if (unicode == NULL)
897 return NULL;
898 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100899
900 _PyUnicode_WSTR_LENGTH(unicode) = length;
901 _PyUnicode_HASH(unicode) = -1;
902 _PyUnicode_STATE(unicode).interned = 0;
903 _PyUnicode_STATE(unicode).kind = 0;
904 _PyUnicode_STATE(unicode).compact = 0;
905 _PyUnicode_STATE(unicode).ready = 0;
906 _PyUnicode_STATE(unicode).ascii = 0;
907 _PyUnicode_DATA_ANY(unicode) = NULL;
908 _PyUnicode_LENGTH(unicode) = 0;
909 _PyUnicode_UTF8(unicode) = NULL;
910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
913 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100914 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000915 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100916 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200918
Jeremy Hyltond8082792003-09-16 19:41:39 +0000919 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000920 * the caller fails before initializing str -- unicode_resize()
921 * reads str[0], and the Keep-Alive optimization can keep memory
922 * allocated for str alive across a call to unicode_dealloc(unicode).
923 * We don't want unicode_resize to read uninitialized memory in
924 * that case.
925 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926 _PyUnicode_WSTR(unicode)[0] = 0;
927 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100928
Victor Stinner7931d9a2011-11-04 00:22:48 +0100929 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000930 return unicode;
931}
932
Victor Stinnerf42dc442011-10-02 23:33:16 +0200933static const char*
934unicode_kind_name(PyObject *unicode)
935{
Victor Stinner42dfd712011-10-03 14:41:45 +0200936 /* don't check consistency: unicode_kind_name() is called from
937 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938 if (!PyUnicode_IS_COMPACT(unicode))
939 {
940 if (!PyUnicode_IS_READY(unicode))
941 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600942 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 {
944 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200945 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946 return "legacy ascii";
947 else
948 return "legacy latin1";
949 case PyUnicode_2BYTE_KIND:
950 return "legacy UCS2";
951 case PyUnicode_4BYTE_KIND:
952 return "legacy UCS4";
953 default:
954 return "<legacy invalid kind>";
955 }
956 }
957 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600958 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200959 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200960 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200961 return "ascii";
962 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200967 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200968 default:
969 return "<invalid compact kind>";
970 }
971}
972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974/* Functions wrapping macros for use in debugger */
975char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200976 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977}
978
979void *_PyUnicode_compact_data(void *unicode) {
980 return _PyUnicode_COMPACT_DATA(unicode);
981}
982void *_PyUnicode_data(void *unicode){
983 printf("obj %p\n", unicode);
984 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
985 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
986 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
987 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
988 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
989 return PyUnicode_DATA(unicode);
990}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200991
992void
993_PyUnicode_Dump(PyObject *op)
994{
995 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200996 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
997 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
998 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200999
Victor Stinnera849a4b2011-10-03 12:12:11 +02001000 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001001 {
1002 if (ascii->state.ascii)
1003 data = (ascii + 1);
1004 else
1005 data = (compact + 1);
1006 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001007 else
1008 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001009 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1010
Victor Stinnera849a4b2011-10-03 12:12:11 +02001011 if (ascii->wstr == data)
1012 printf("shared ");
1013 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001014
Victor Stinnera3b334d2011-10-03 13:53:37 +02001015 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 printf(" (%zu), ", compact->wstr_length);
1017 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1018 printf("shared ");
1019 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001020 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001022}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001023#endif
1024
1025PyObject *
1026PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1027{
1028 PyObject *obj;
1029 PyCompactUnicodeObject *unicode;
1030 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001031 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001032 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035
1036 /* Optimization for empty strings */
1037 if (size == 0 && unicode_empty != NULL) {
1038 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001039 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 }
1041
Victor Stinner9e9d6892011-10-04 01:02:02 +02001042 is_ascii = 0;
1043 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 struct_size = sizeof(PyCompactUnicodeObject);
1045 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001046 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 char_size = 1;
1048 is_ascii = 1;
1049 struct_size = sizeof(PyASCIIObject);
1050 }
1051 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001052 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 char_size = 1;
1054 }
1055 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001056 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 char_size = 2;
1058 if (sizeof(wchar_t) == 2)
1059 is_sharing = 1;
1060 }
1061 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001062 if (maxchar > MAX_UNICODE) {
1063 PyErr_SetString(PyExc_SystemError,
1064 "invalid maximum character passed to PyUnicode_New");
1065 return NULL;
1066 }
Victor Stinner8f825062012-04-27 13:55:39 +02001067 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 char_size = 4;
1069 if (sizeof(wchar_t) == 4)
1070 is_sharing = 1;
1071 }
1072
1073 /* Ensure we won't overflow the size. */
1074 if (size < 0) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "Negative size passed to PyUnicode_New");
1077 return NULL;
1078 }
1079 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1080 return PyErr_NoMemory();
1081
1082 /* Duplicated allocation code from _PyObject_New() instead of a call to
1083 * PyObject_New() so we are able to allocate space for the object and
1084 * it's data buffer.
1085 */
1086 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1087 if (obj == NULL)
1088 return PyErr_NoMemory();
1089 obj = PyObject_INIT(obj, &PyUnicode_Type);
1090 if (obj == NULL)
1091 return NULL;
1092
1093 unicode = (PyCompactUnicodeObject *)obj;
1094 if (is_ascii)
1095 data = ((PyASCIIObject*)obj) + 1;
1096 else
1097 data = unicode + 1;
1098 _PyUnicode_LENGTH(unicode) = size;
1099 _PyUnicode_HASH(unicode) = -1;
1100 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001101 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001102 _PyUnicode_STATE(unicode).compact = 1;
1103 _PyUnicode_STATE(unicode).ready = 1;
1104 _PyUnicode_STATE(unicode).ascii = is_ascii;
1105 if (is_ascii) {
1106 ((char*)data)[size] = 0;
1107 _PyUnicode_WSTR(unicode) = NULL;
1108 }
Victor Stinner8f825062012-04-27 13:55:39 +02001109 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 ((char*)data)[size] = 0;
1111 _PyUnicode_WSTR(unicode) = NULL;
1112 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001114 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 else {
1117 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001118 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001119 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001121 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 ((Py_UCS4*)data)[size] = 0;
1123 if (is_sharing) {
1124 _PyUnicode_WSTR_LENGTH(unicode) = size;
1125 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1126 }
1127 else {
1128 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1129 _PyUnicode_WSTR(unicode) = NULL;
1130 }
1131 }
Victor Stinner8f825062012-04-27 13:55:39 +02001132#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001133 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001134#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001135 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136 return obj;
1137}
1138
1139#if SIZEOF_WCHAR_T == 2
1140/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1141 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001142 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143
1144 This function assumes that unicode can hold one more code point than wstr
1145 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001146static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001148 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149{
1150 const wchar_t *iter;
1151 Py_UCS4 *ucs4_out;
1152
Victor Stinner910337b2011-10-03 03:20:16 +02001153 assert(unicode != NULL);
1154 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1156 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1157
1158 for (iter = begin; iter < end; ) {
1159 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1160 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001161 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1162 && (iter+1) < end
1163 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001164 {
Victor Stinner551ac952011-11-29 22:58:13 +01001165 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 iter += 2;
1167 }
1168 else {
1169 *ucs4_out++ = *iter;
1170 iter++;
1171 }
1172 }
1173 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1174 _PyUnicode_GET_LENGTH(unicode)));
1175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176}
1177#endif
1178
Victor Stinnercd9950f2011-10-02 00:34:53 +02001179static int
Victor Stinner488fa492011-12-12 00:01:39 +01001180unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001181{
Victor Stinner488fa492011-12-12 00:01:39 +01001182 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001183 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001184 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001185 return -1;
1186 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001187 return 0;
1188}
1189
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001190static int
1191_copy_characters(PyObject *to, Py_ssize_t to_start,
1192 PyObject *from, Py_ssize_t from_start,
1193 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001195 unsigned int from_kind, to_kind;
1196 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001197
Victor Stinneree4544c2012-05-09 22:24:08 +02001198 assert(0 <= how_many);
1199 assert(0 <= from_start);
1200 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001201 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204
Victor Stinnerd3f08822012-05-29 12:57:52 +02001205 assert(PyUnicode_Check(to));
1206 assert(PyUnicode_IS_READY(to));
1207 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1208
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001209 if (how_many == 0)
1210 return 0;
1211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001215 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216
Victor Stinnerf1852262012-06-16 16:38:26 +02001217#ifdef Py_DEBUG
1218 if (!check_maxchar
1219 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1220 {
1221 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1222 Py_UCS4 ch;
1223 Py_ssize_t i;
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 assert(ch <= to_maxchar);
1227 }
1228 }
1229#endif
1230
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001231 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001232 if (check_maxchar
1233 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1234 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 /* Writing Latin-1 characters into an ASCII string requires to
1236 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 Py_UCS4 max_char;
1238 max_char = ucs1lib_find_max_char(from_data,
1239 (Py_UCS1*)from_data + how_many);
1240 if (max_char >= 128)
1241 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001243 Py_MEMCPY((char*)to_data + to_kind * to_start,
1244 (char*)from_data + from_kind * from_start,
1245 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001247 else if (from_kind == PyUnicode_1BYTE_KIND
1248 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001249 {
1250 _PyUnicode_CONVERT_BYTES(
1251 Py_UCS1, Py_UCS2,
1252 PyUnicode_1BYTE_DATA(from) + from_start,
1253 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1254 PyUnicode_2BYTE_DATA(to) + to_start
1255 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001256 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001257 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001258 && to_kind == PyUnicode_4BYTE_KIND)
1259 {
1260 _PyUnicode_CONVERT_BYTES(
1261 Py_UCS1, Py_UCS4,
1262 PyUnicode_1BYTE_DATA(from) + from_start,
1263 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1264 PyUnicode_4BYTE_DATA(to) + to_start
1265 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001266 }
1267 else if (from_kind == PyUnicode_2BYTE_KIND
1268 && to_kind == PyUnicode_4BYTE_KIND)
1269 {
1270 _PyUnicode_CONVERT_BYTES(
1271 Py_UCS2, Py_UCS4,
1272 PyUnicode_2BYTE_DATA(from) + from_start,
1273 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1274 PyUnicode_4BYTE_DATA(to) + to_start
1275 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001276 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001277 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001278 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1279
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001280 if (!check_maxchar) {
1281 if (from_kind == PyUnicode_2BYTE_KIND
1282 && to_kind == PyUnicode_1BYTE_KIND)
1283 {
1284 _PyUnicode_CONVERT_BYTES(
1285 Py_UCS2, Py_UCS1,
1286 PyUnicode_2BYTE_DATA(from) + from_start,
1287 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1288 PyUnicode_1BYTE_DATA(to) + to_start
1289 );
1290 }
1291 else if (from_kind == PyUnicode_4BYTE_KIND
1292 && to_kind == PyUnicode_1BYTE_KIND)
1293 {
1294 _PyUnicode_CONVERT_BYTES(
1295 Py_UCS4, Py_UCS1,
1296 PyUnicode_4BYTE_DATA(from) + from_start,
1297 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1298 PyUnicode_1BYTE_DATA(to) + to_start
1299 );
1300 }
1301 else if (from_kind == PyUnicode_4BYTE_KIND
1302 && to_kind == PyUnicode_2BYTE_KIND)
1303 {
1304 _PyUnicode_CONVERT_BYTES(
1305 Py_UCS4, Py_UCS2,
1306 PyUnicode_4BYTE_DATA(from) + from_start,
1307 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1308 PyUnicode_2BYTE_DATA(to) + to_start
1309 );
1310 }
1311 else {
1312 assert(0);
1313 return -1;
1314 }
1315 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001316 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001317 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001318 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001319 Py_ssize_t i;
1320
Victor Stinnera0702ab2011-09-29 14:14:38 +02001321 for (i=0; i < how_many; i++) {
1322 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001323 if (ch > to_maxchar)
1324 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001325 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1326 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001327 }
1328 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001329 return 0;
1330}
1331
Victor Stinnerd3f08822012-05-29 12:57:52 +02001332void
1333_PyUnicode_FastCopyCharacters(
1334 PyObject *to, Py_ssize_t to_start,
1335 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001336{
1337 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1338}
1339
1340Py_ssize_t
1341PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1342 PyObject *from, Py_ssize_t from_start,
1343 Py_ssize_t how_many)
1344{
1345 int err;
1346
1347 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1348 PyErr_BadInternalCall();
1349 return -1;
1350 }
1351
Benjamin Petersonbac79492012-01-14 13:34:47 -05001352 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001353 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001354 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001355 return -1;
1356
Victor Stinnerd3f08822012-05-29 12:57:52 +02001357 if (from_start < 0) {
1358 PyErr_SetString(PyExc_IndexError, "string index out of range");
1359 return -1;
1360 }
1361 if (to_start < 0) {
1362 PyErr_SetString(PyExc_IndexError, "string index out of range");
1363 return -1;
1364 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1366 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1367 PyErr_Format(PyExc_SystemError,
1368 "Cannot write %zi characters at %zi "
1369 "in a string of %zi characters",
1370 how_many, to_start, PyUnicode_GET_LENGTH(to));
1371 return -1;
1372 }
1373
1374 if (how_many == 0)
1375 return 0;
1376
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001378 return -1;
1379
1380 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1381 if (err) {
1382 PyErr_Format(PyExc_SystemError,
1383 "Cannot copy %s characters "
1384 "into a string of %s characters",
1385 unicode_kind_name(from),
1386 unicode_kind_name(to));
1387 return -1;
1388 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390}
1391
Victor Stinner17222162011-09-28 22:15:37 +02001392/* Find the maximum code point and count the number of surrogate pairs so a
1393 correct string length can be computed before converting a string to UCS4.
1394 This function counts single surrogates as a character and not as a pair.
1395
1396 Return 0 on success, or -1 on error. */
1397static int
1398find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1399 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400{
1401 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001402 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403
Victor Stinnerc53be962011-10-02 21:33:54 +02001404 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 *num_surrogates = 0;
1406 *maxchar = 0;
1407
1408 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001410 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1411 && (iter+1) < end
1412 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1413 {
1414 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1415 ++(*num_surrogates);
1416 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 }
1418 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001420 {
1421 ch = *iter;
1422 iter++;
1423 }
1424 if (ch > *maxchar) {
1425 *maxchar = ch;
1426 if (*maxchar > MAX_UNICODE) {
1427 PyErr_Format(PyExc_ValueError,
1428 "character U+%x is not in range [U+0000; U+10ffff]",
1429 ch);
1430 return -1;
1431 }
1432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 return 0;
1435}
1436
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001437int
1438_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439{
1440 wchar_t *end;
1441 Py_UCS4 maxchar = 0;
1442 Py_ssize_t num_surrogates;
1443#if SIZEOF_WCHAR_T == 2
1444 Py_ssize_t length_wo_surrogates;
1445#endif
1446
Georg Brandl7597add2011-10-05 16:36:47 +02001447 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001448 strings were created using _PyObject_New() and where no canonical
1449 representation (the str field) has been set yet aka strings
1450 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001451 assert(_PyUnicode_CHECK(unicode));
1452 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001454 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001455 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001456 /* Actually, it should neither be interned nor be anything else: */
1457 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001460 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
1464 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001465 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1466 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 PyErr_NoMemory();
1468 return -1;
1469 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001470 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 _PyUnicode_WSTR(unicode), end,
1472 PyUnicode_1BYTE_DATA(unicode));
1473 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1474 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1475 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1476 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001477 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001478 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001479 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 }
1481 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001483 _PyUnicode_UTF8(unicode) = NULL;
1484 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 PyObject_FREE(_PyUnicode_WSTR(unicode));
1487 _PyUnicode_WSTR(unicode) = NULL;
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 }
1490 /* In this case we might have to convert down from 4-byte native
1491 wchar_t to 2-byte unicode. */
1492 else if (maxchar < 65536) {
1493 assert(num_surrogates == 0 &&
1494 "FindMaxCharAndNumSurrogatePairs() messed up");
1495
Victor Stinner506f5922011-09-28 22:34:18 +02001496#if SIZEOF_WCHAR_T == 2
1497 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001498 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001499 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1500 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1501 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001502 _PyUnicode_UTF8(unicode) = NULL;
1503 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001504#else
1505 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001507 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001508 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001509 PyErr_NoMemory();
1510 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 }
Victor Stinner506f5922011-09-28 22:34:18 +02001512 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1513 _PyUnicode_WSTR(unicode), end,
1514 PyUnicode_2BYTE_DATA(unicode));
1515 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1516 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1517 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001518 _PyUnicode_UTF8(unicode) = NULL;
1519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001520 PyObject_FREE(_PyUnicode_WSTR(unicode));
1521 _PyUnicode_WSTR(unicode) = NULL;
1522 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1523#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524 }
1525 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1526 else {
1527#if SIZEOF_WCHAR_T == 2
1528 /* in case the native representation is 2-bytes, we need to allocate a
1529 new normalized 4-byte version. */
1530 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001531 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1532 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 PyErr_NoMemory();
1534 return -1;
1535 }
1536 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1537 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001538 _PyUnicode_UTF8(unicode) = NULL;
1539 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001540 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1541 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001542 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543 PyObject_FREE(_PyUnicode_WSTR(unicode));
1544 _PyUnicode_WSTR(unicode) = NULL;
1545 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1546#else
1547 assert(num_surrogates == 0);
1548
Victor Stinnerc3c74152011-10-02 20:39:55 +02001549 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001551 _PyUnicode_UTF8(unicode) = NULL;
1552 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1554#endif
1555 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1556 }
1557 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001558 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 return 0;
1560}
1561
Alexander Belopolsky40018472011-02-26 01:02:56 +00001562static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001563unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564{
Walter Dörwald16807132007-05-25 13:52:07 +00001565 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001566 case SSTATE_NOT_INTERNED:
1567 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001568
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 case SSTATE_INTERNED_MORTAL:
1570 /* revive dead object temporarily for DelItem */
1571 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001572 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 Py_FatalError(
1574 "deletion of interned string failed");
1575 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001576
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 case SSTATE_INTERNED_IMMORTAL:
1578 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001579
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 default:
1581 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001582 }
1583
Victor Stinner03490912011-10-03 23:45:12 +02001584 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001586 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001587 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001588 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1589 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001591 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592}
1593
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or the
   cached entry of unicode_latin1[] for its single character, else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that already has a canonical kind can
       be one of the cached Latin-1 characters. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return first < 256 && unicode_latin1[first] == unicode;
}
#endif
1610
Alexander Belopolsky40018472011-02-26 01:02:56 +00001611static int
Victor Stinner488fa492011-12-12 00:01:39 +01001612unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001613{
Victor Stinner488fa492011-12-12 00:01:39 +01001614 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 if (Py_REFCNT(unicode) != 1)
1616 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001617 if (_PyUnicode_HASH(unicode) != -1)
1618 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001619 if (PyUnicode_CHECK_INTERNED(unicode))
1620 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001621 if (!PyUnicode_CheckExact(unicode))
1622 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001623#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001624 /* singleton refcount is greater than 1 */
1625 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001626#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001627 return 1;
1628}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629
Victor Stinnerfe226c02011-10-03 03:52:20 +02001630static int
1631unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1632{
1633 PyObject *unicode;
1634 Py_ssize_t old_length;
1635
1636 assert(p_unicode != NULL);
1637 unicode = *p_unicode;
1638
1639 assert(unicode != NULL);
1640 assert(PyUnicode_Check(unicode));
1641 assert(0 <= length);
1642
Victor Stinner910337b2011-10-03 03:20:16 +02001643 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001644 old_length = PyUnicode_WSTR_LENGTH(unicode);
1645 else
1646 old_length = PyUnicode_GET_LENGTH(unicode);
1647 if (old_length == length)
1648 return 0;
1649
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001650 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001651 _Py_INCREF_UNICODE_EMPTY();
1652 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001653 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 Py_DECREF(*p_unicode);
1655 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001656 return 0;
1657 }
1658
Victor Stinner488fa492011-12-12 00:01:39 +01001659 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001660 PyObject *copy = resize_copy(unicode, length);
1661 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001663 Py_DECREF(*p_unicode);
1664 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001665 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666 }
1667
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001669 PyObject *new_unicode = resize_compact(unicode, length);
1670 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001671 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001672 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001674 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001675 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001676}
1677
Alexander Belopolsky40018472011-02-26 01:02:56 +00001678int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001679PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001680{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001681 PyObject *unicode;
1682 if (p_unicode == NULL) {
1683 PyErr_BadInternalCall();
1684 return -1;
1685 }
1686 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001687 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688 {
1689 PyErr_BadInternalCall();
1690 return -1;
1691 }
1692 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001693}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001694
Victor Stinnerc5166102012-02-22 13:55:02 +01001695/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001696
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001697 WARNING: The function doesn't copy the terminating null character and
1698 doesn't check the maximum character (may write a latin1 character in an
1699 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001700static void
1701unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1702 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001703{
1704 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1705 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001706 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001707
1708 switch (kind) {
1709 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001710 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001711#ifdef Py_DEBUG
1712 if (PyUnicode_IS_ASCII(unicode)) {
1713 Py_UCS4 maxchar = ucs1lib_find_max_char(
1714 (const Py_UCS1*)str,
1715 (const Py_UCS1*)str + len);
1716 assert(maxchar < 128);
1717 }
1718#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001719 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001720 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001721 }
1722 case PyUnicode_2BYTE_KIND: {
1723 Py_UCS2 *start = (Py_UCS2 *)data + index;
1724 Py_UCS2 *ucs2 = start;
1725 assert(index <= PyUnicode_GET_LENGTH(unicode));
1726
Victor Stinner184252a2012-06-16 02:57:41 +02001727 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 *ucs2 = (Py_UCS2)*str;
1729
1730 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001731 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 }
1733 default: {
1734 Py_UCS4 *start = (Py_UCS4 *)data + index;
1735 Py_UCS4 *ucs4 = start;
1736 assert(kind == PyUnicode_4BYTE_KIND);
1737 assert(index <= PyUnicode_GET_LENGTH(unicode));
1738
Victor Stinner184252a2012-06-16 02:57:41 +02001739 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001740 *ucs4 = (Py_UCS4)*str;
1741
1742 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001743 }
1744 }
1745}
1746
1747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748static PyObject*
1749get_latin1_char(unsigned char ch)
1750{
Victor Stinnera464fc12011-10-02 20:39:30 +02001751 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001753 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 if (!unicode)
1755 return NULL;
1756 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001757 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 unicode_latin1[ch] = unicode;
1759 }
1760 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001761 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Alexander Belopolsky40018472011-02-26 01:02:56 +00001764PyObject *
1765PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001767 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 Py_UCS4 maxchar = 0;
1769 Py_ssize_t num_surrogates;
1770
1771 if (u == NULL)
1772 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001774 /* If the Unicode data is known at construction time, we can apply
1775 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001778 if (size == 0)
1779 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 /* Single character Unicode objects in the Latin-1 range are
1782 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001783 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 return get_latin1_char((unsigned char)*u);
1785
1786 /* If not empty and not single character, copy the Unicode data
1787 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001788 if (find_maxchar_surrogates(u, u + size,
1789 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 return NULL;
1791
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 if (!unicode)
1794 return NULL;
1795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 switch (PyUnicode_KIND(unicode)) {
1797 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001798 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1800 break;
1801 case PyUnicode_2BYTE_KIND:
1802#if Py_UNICODE_SIZE == 2
1803 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1804#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001805 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1807#endif
1808 break;
1809 case PyUnicode_4BYTE_KIND:
1810#if SIZEOF_WCHAR_T == 2
1811 /* This is the only case which has to process surrogates, thus
1812 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001813 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814#else
1815 assert(num_surrogates == 0);
1816 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1817#endif
1818 break;
1819 default:
1820 assert(0 && "Impossible state");
1821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001823 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824}
1825
Alexander Belopolsky40018472011-02-26 01:02:56 +00001826PyObject *
1827PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001828{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001829 if (size < 0) {
1830 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001831 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 return NULL;
1833 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001834 if (u != NULL)
1835 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1836 else
1837 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001838}
1839
Alexander Belopolsky40018472011-02-26 01:02:56 +00001840PyObject *
1841PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001842{
1843 size_t size = strlen(u);
1844 if (size > PY_SSIZE_T_MAX) {
1845 PyErr_SetString(PyExc_OverflowError, "input too long");
1846 return NULL;
1847 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001848 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001849}
1850
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001851PyObject *
1852_PyUnicode_FromId(_Py_Identifier *id)
1853{
1854 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001855 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1856 strlen(id->string),
1857 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001858 if (!id->object)
1859 return NULL;
1860 PyUnicode_InternInPlace(&id->object);
1861 assert(!id->next);
1862 id->next = static_strings;
1863 static_strings = id;
1864 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001865 return id->object;
1866}
1867
1868void
1869_PyUnicode_ClearStaticStrings()
1870{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001871 _Py_Identifier *tmp, *s = static_strings;
1872 while (s) {
1873 Py_DECREF(s->object);
1874 s->object = NULL;
1875 tmp = s->next;
1876 s->next = NULL;
1877 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001878 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001879 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001880}
1881
Benjamin Peterson0df54292012-03-26 14:50:32 -04001882/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001883
Victor Stinnerd3f08822012-05-29 12:57:52 +02001884PyObject*
1885_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001886{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001887 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001888 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001889 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001890#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001891 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001892#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001893 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001894 }
Victor Stinner785938e2011-12-11 20:09:03 +01001895 unicode = PyUnicode_New(size, 127);
1896 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001897 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001898 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1899 assert(_PyUnicode_CheckConsistency(unicode, 1));
1900 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001901}
1902
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001903static Py_UCS4
1904kind_maxchar_limit(unsigned int kind)
1905{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001906 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907 case PyUnicode_1BYTE_KIND:
1908 return 0x80;
1909 case PyUnicode_2BYTE_KIND:
1910 return 0x100;
1911 case PyUnicode_4BYTE_KIND:
1912 return 0x10000;
1913 default:
1914 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001915 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001916 }
1917}
1918
Victor Stinnere6abb482012-05-02 01:15:40 +02001919Py_LOCAL_INLINE(Py_UCS4)
1920align_maxchar(Py_UCS4 maxchar)
1921{
1922 if (maxchar <= 127)
1923 return 127;
1924 else if (maxchar <= 255)
1925 return 255;
1926 else if (maxchar <= 65535)
1927 return 65535;
1928 else
1929 return MAX_UNICODE;
1930}
1931
Victor Stinner702c7342011-10-05 13:50:52 +02001932static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001933_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001937
Serhiy Storchaka678db842013-01-26 12:16:36 +02001938 if (size == 0)
1939 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001940 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001941 if (size == 1)
1942 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001943
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001945 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 if (!res)
1947 return NULL;
1948 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001949 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001951}
1952
Victor Stinnere57b1c02011-09-28 22:20:48 +02001953static PyObject*
1954_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955{
1956 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001957 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001958
Serhiy Storchaka678db842013-01-26 12:16:36 +02001959 if (size == 0)
1960 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001961 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001962 if (size == 1) {
1963 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001964 int kind;
1965 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001966 if (ch < 256)
1967 return get_latin1_char((unsigned char)ch);
1968
1969 res = PyUnicode_New(1, ch);
1970 if (res == NULL)
1971 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001972 kind = PyUnicode_KIND(res);
1973 data = PyUnicode_DATA(res);
1974 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001975 assert(_PyUnicode_CheckConsistency(res, 1));
1976 return res;
1977 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001979 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 if (!res)
1982 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001983 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001985 else {
1986 _PyUnicode_CONVERT_BYTES(
1987 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1988 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001989 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 return res;
1991}
1992
Victor Stinnere57b1c02011-09-28 22:20:48 +02001993static PyObject*
1994_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995{
1996 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998
Serhiy Storchaka678db842013-01-26 12:16:36 +02001999 if (size == 0)
2000 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002002 if (size == 1) {
2003 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002004 int kind;
2005 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002006 if (ch < 256)
2007 return get_latin1_char((unsigned char)ch);
2008
2009 res = PyUnicode_New(1, ch);
2010 if (res == NULL)
2011 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002012 kind = PyUnicode_KIND(res);
2013 data = PyUnicode_DATA(res);
2014 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002015 assert(_PyUnicode_CheckConsistency(res, 1));
2016 return res;
2017 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002019 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002020 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 if (!res)
2022 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002023 if (max_char < 256)
2024 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2025 PyUnicode_1BYTE_DATA(res));
2026 else if (max_char < 0x10000)
2027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2028 PyUnicode_2BYTE_DATA(res));
2029 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002031 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 return res;
2033}
2034
2035PyObject*
2036PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2037{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002038 if (size < 0) {
2039 PyErr_SetString(PyExc_ValueError, "size must be positive");
2040 return NULL;
2041 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002042 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002044 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002046 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002048 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002049 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002050 PyErr_SetString(PyExc_SystemError, "invalid kind");
2051 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053}
2054
Victor Stinnerece58de2012-04-23 23:36:38 +02002055Py_UCS4
2056_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2057{
2058 enum PyUnicode_Kind kind;
2059 void *startptr, *endptr;
2060
2061 assert(PyUnicode_IS_READY(unicode));
2062 assert(0 <= start);
2063 assert(end <= PyUnicode_GET_LENGTH(unicode));
2064 assert(start <= end);
2065
2066 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2067 return PyUnicode_MAX_CHAR_VALUE(unicode);
2068
2069 if (start == end)
2070 return 127;
2071
Victor Stinner94d558b2012-04-27 22:26:58 +02002072 if (PyUnicode_IS_ASCII(unicode))
2073 return 127;
2074
Victor Stinnerece58de2012-04-23 23:36:38 +02002075 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002076 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002077 endptr = (char *)startptr + end * kind;
2078 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002079 switch(kind) {
2080 case PyUnicode_1BYTE_KIND:
2081 return ucs1lib_find_max_char(startptr, endptr);
2082 case PyUnicode_2BYTE_KIND:
2083 return ucs2lib_find_max_char(startptr, endptr);
2084 case PyUnicode_4BYTE_KIND:
2085 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002086 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002087 assert(0);
2088 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 }
2090}
2091
Victor Stinner25a4b292011-10-06 12:31:55 +02002092/* Ensure that a string uses the most efficient storage, if it is not the
2093 case: create a new string with of the right kind. Write NULL into *p_unicode
2094 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002095static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002096unicode_adjust_maxchar(PyObject **p_unicode)
2097{
2098 PyObject *unicode, *copy;
2099 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002100 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002101 unsigned int kind;
2102
2103 assert(p_unicode != NULL);
2104 unicode = *p_unicode;
2105 assert(PyUnicode_IS_READY(unicode));
2106 if (PyUnicode_IS_ASCII(unicode))
2107 return;
2108
2109 len = PyUnicode_GET_LENGTH(unicode);
2110 kind = PyUnicode_KIND(unicode);
2111 if (kind == PyUnicode_1BYTE_KIND) {
2112 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002113 max_char = ucs1lib_find_max_char(u, u + len);
2114 if (max_char >= 128)
2115 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002116 }
2117 else if (kind == PyUnicode_2BYTE_KIND) {
2118 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002119 max_char = ucs2lib_find_max_char(u, u + len);
2120 if (max_char >= 256)
2121 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 }
2123 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002124 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002125 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002126 max_char = ucs4lib_find_max_char(u, u + len);
2127 if (max_char >= 0x10000)
2128 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002131 if (copy != NULL)
2132 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 Py_DECREF(unicode);
2134 *p_unicode = copy;
2135}
2136
Victor Stinner034f6cf2011-09-30 02:26:44 +02002137PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002138_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002139{
Victor Stinner87af4f22011-11-21 23:03:47 +01002140 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002141 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002142
Victor Stinner034f6cf2011-09-30 02:26:44 +02002143 if (!PyUnicode_Check(unicode)) {
2144 PyErr_BadInternalCall();
2145 return NULL;
2146 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002147 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002148 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149
Victor Stinner87af4f22011-11-21 23:03:47 +01002150 length = PyUnicode_GET_LENGTH(unicode);
2151 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152 if (!copy)
2153 return NULL;
2154 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2155
Victor Stinner87af4f22011-11-21 23:03:47 +01002156 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2157 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002158 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002159 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002160}
2161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163/* Widen Unicode objects to larger buffers. Don't write terminating null
2164 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165
2166void*
2167_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2168{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002169 Py_ssize_t len;
2170 void *result;
2171 unsigned int skind;
2172
Benjamin Petersonbac79492012-01-14 13:34:47 -05002173 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 return NULL;
2175
2176 len = PyUnicode_GET_LENGTH(s);
2177 skind = PyUnicode_KIND(s);
2178 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002179 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return NULL;
2181 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002182 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 case PyUnicode_2BYTE_KIND:
2184 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2185 if (!result)
2186 return PyErr_NoMemory();
2187 assert(skind == PyUnicode_1BYTE_KIND);
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS1, Py_UCS2,
2190 PyUnicode_1BYTE_DATA(s),
2191 PyUnicode_1BYTE_DATA(s) + len,
2192 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002194 case PyUnicode_4BYTE_KIND:
2195 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2196 if (!result)
2197 return PyErr_NoMemory();
2198 if (skind == PyUnicode_2BYTE_KIND) {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS4,
2201 PyUnicode_2BYTE_DATA(s),
2202 PyUnicode_2BYTE_DATA(s) + len,
2203 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002205 else {
2206 assert(skind == PyUnicode_1BYTE_KIND);
2207 _PyUnicode_CONVERT_BYTES(
2208 Py_UCS1, Py_UCS4,
2209 PyUnicode_1BYTE_DATA(s),
2210 PyUnicode_1BYTE_DATA(s) + len,
2211 result);
2212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002214 default:
2215 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 }
Victor Stinner01698042011-10-04 00:04:26 +02002217 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return NULL;
2219}
2220
2221static Py_UCS4*
2222as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2223 int copy_null)
2224{
2225 int kind;
2226 void *data;
2227 Py_ssize_t len, targetlen;
2228 if (PyUnicode_READY(string) == -1)
2229 return NULL;
2230 kind = PyUnicode_KIND(string);
2231 data = PyUnicode_DATA(string);
2232 len = PyUnicode_GET_LENGTH(string);
2233 targetlen = len;
2234 if (copy_null)
2235 targetlen++;
2236 if (!target) {
2237 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2238 PyErr_NoMemory();
2239 return NULL;
2240 }
2241 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2242 if (!target) {
2243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 }
2247 else {
2248 if (targetsize < targetlen) {
2249 PyErr_Format(PyExc_SystemError,
2250 "string is longer than the buffer");
2251 if (copy_null && 0 < targetsize)
2252 target[0] = 0;
2253 return NULL;
2254 }
2255 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 if (kind == PyUnicode_1BYTE_KIND) {
2257 Py_UCS1 *start = (Py_UCS1 *) data;
2258 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 else if (kind == PyUnicode_2BYTE_KIND) {
2261 Py_UCS2 *start = (Py_UCS2 *) data;
2262 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2263 }
2264 else {
2265 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 if (copy_null)
2269 target[len] = 0;
2270 return target;
2271}
2272
2273Py_UCS4*
2274PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2275 int copy_null)
2276{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002277 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 PyErr_BadInternalCall();
2279 return NULL;
2280 }
2281 return as_ucs4(string, target, targetsize, copy_null);
2282}
2283
2284Py_UCS4*
2285PyUnicode_AsUCS4Copy(PyObject *string)
2286{
2287 return as_ucs4(string, NULL, 0, 1);
2288}
2289
2290#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002291
Alexander Belopolsky40018472011-02-26 01:02:56 +00002292PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002293PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002297 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 PyErr_BadInternalCall();
2299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 }
2301
Martin v. Löwis790465f2008-04-05 20:41:37 +00002302 if (size == -1) {
2303 size = wcslen(w);
2304 }
2305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307}
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002310
Walter Dörwald346737f2007-05-31 10:44:43 +00002311static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002312makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002313 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002314{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002315 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002316 if (longflag)
2317 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002318 else if (longlongflag) {
2319 /* longlongflag should only ever be nonzero on machines with
2320 HAVE_LONG_LONG defined */
2321#ifdef HAVE_LONG_LONG
2322 char *f = PY_FORMAT_LONG_LONG;
2323 while (*f)
2324 *fmt++ = *f++;
2325#else
2326 /* we shouldn't ever get here */
2327 assert(0);
2328 *fmt++ = 'l';
2329#endif
2330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 else if (size_tflag) {
2332 char *f = PY_FORMAT_SIZE_T;
2333 while (*f)
2334 *fmt++ = *f++;
2335 }
2336 *fmt++ = c;
2337 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002338}
2339
Victor Stinner15a11362012-10-06 23:48:20 +02002340/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002341 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2342 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2343#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002344
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002345static int
2346unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2347 Py_ssize_t width, Py_ssize_t precision)
2348{
2349 Py_ssize_t length, fill, arglen;
2350 Py_UCS4 maxchar;
2351
2352 if (PyUnicode_READY(str) == -1)
2353 return -1;
2354
2355 length = PyUnicode_GET_LENGTH(str);
2356 if ((precision == -1 || precision >= length)
2357 && width <= length)
2358 return _PyUnicodeWriter_WriteStr(writer, str);
2359
2360 if (precision != -1)
2361 length = Py_MIN(precision, length);
2362
2363 arglen = Py_MAX(length, width);
2364 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2365 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2366 else
2367 maxchar = writer->maxchar;
2368
2369 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2370 return -1;
2371
2372 if (width > length) {
2373 fill = width - length;
2374 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2375 return -1;
2376 writer->pos += fill;
2377 }
2378
2379 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2380 str, 0, length);
2381 writer->pos += length;
2382 return 0;
2383}
2384
2385static int
2386unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2387 Py_ssize_t width, Py_ssize_t precision)
2388{
2389 /* UTF-8 */
2390 Py_ssize_t length;
2391 PyObject *unicode;
2392 int res;
2393
2394 length = strlen(str);
2395 if (precision != -1)
2396 length = Py_MIN(length, precision);
2397 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2398 if (unicode == NULL)
2399 return -1;
2400
2401 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2402 Py_DECREF(unicode);
2403 return res;
2404}
2405
Victor Stinner96865452011-03-01 23:44:09 +00002406static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002407unicode_fromformat_arg(_PyUnicodeWriter *writer,
2408 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002409{
Victor Stinnere215d962012-10-06 23:03:36 +02002410 const char *p;
2411 Py_ssize_t len;
2412 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 Py_ssize_t width;
2414 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002415 int longflag;
2416 int longlongflag;
2417 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002419
2420 p = f;
2421 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002422 zeropad = 0;
2423 if (*f == '0') {
2424 zeropad = 1;
2425 f++;
2426 }
Victor Stinner96865452011-03-01 23:44:09 +00002427
2428 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002429 width = -1;
2430 if (Py_ISDIGIT((unsigned)*f)) {
2431 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002432 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002433 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002434 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002435 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002436 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002437 return NULL;
2438 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002440 f++;
2441 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002442 }
2443 precision = -1;
2444 if (*f == '.') {
2445 f++;
2446 if (Py_ISDIGIT((unsigned)*f)) {
2447 precision = (*f - '0');
2448 f++;
2449 while (Py_ISDIGIT((unsigned)*f)) {
2450 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2451 PyErr_SetString(PyExc_ValueError,
2452 "precision too big");
2453 return NULL;
2454 }
2455 precision = (precision * 10) + (*f - '0');
2456 f++;
2457 }
2458 }
Victor Stinner96865452011-03-01 23:44:09 +00002459 if (*f == '%') {
2460 /* "%.3%s" => f points to "3" */
2461 f--;
2462 }
2463 }
2464 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002465 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002466 f--;
2467 }
Victor Stinner96865452011-03-01 23:44:09 +00002468
2469 /* Handle %ld, %lu, %lld and %llu. */
2470 longflag = 0;
2471 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002472 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002473 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002474 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002475 longflag = 1;
2476 ++f;
2477 }
2478#ifdef HAVE_LONG_LONG
2479 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002480 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002481 longlongflag = 1;
2482 f += 2;
2483 }
2484#endif
2485 }
2486 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002487 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002488 size_tflag = 1;
2489 ++f;
2490 }
Victor Stinnere215d962012-10-06 23:03:36 +02002491
2492 if (f[1] == '\0')
2493 writer->overallocate = 0;
2494
2495 switch (*f) {
2496 case 'c':
2497 {
2498 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002499 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002500 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002501 "character argument not in range(0x110000)");
2502 return NULL;
2503 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002504 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002505 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002506 break;
2507 }
2508
2509 case 'i':
2510 case 'd':
2511 case 'u':
2512 case 'x':
2513 {
2514 /* used by sprintf */
2515 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002516 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002517 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002518
2519 if (*f == 'u') {
2520 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2521
2522 if (longflag)
2523 len = sprintf(buffer, fmt,
2524 va_arg(*vargs, unsigned long));
2525#ifdef HAVE_LONG_LONG
2526 else if (longlongflag)
2527 len = sprintf(buffer, fmt,
2528 va_arg(*vargs, unsigned PY_LONG_LONG));
2529#endif
2530 else if (size_tflag)
2531 len = sprintf(buffer, fmt,
2532 va_arg(*vargs, size_t));
2533 else
2534 len = sprintf(buffer, fmt,
2535 va_arg(*vargs, unsigned int));
2536 }
2537 else if (*f == 'x') {
2538 makefmt(fmt, 0, 0, 0, 'x');
2539 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2540 }
2541 else {
2542 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2543
2544 if (longflag)
2545 len = sprintf(buffer, fmt,
2546 va_arg(*vargs, long));
2547#ifdef HAVE_LONG_LONG
2548 else if (longlongflag)
2549 len = sprintf(buffer, fmt,
2550 va_arg(*vargs, PY_LONG_LONG));
2551#endif
2552 else if (size_tflag)
2553 len = sprintf(buffer, fmt,
2554 va_arg(*vargs, Py_ssize_t));
2555 else
2556 len = sprintf(buffer, fmt,
2557 va_arg(*vargs, int));
2558 }
2559 assert(len >= 0);
2560
Victor Stinnere215d962012-10-06 23:03:36 +02002561 if (precision < len)
2562 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002563
2564 arglen = Py_MAX(precision, width);
2565 assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127);
2566 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2567 return NULL;
2568
Victor Stinnere215d962012-10-06 23:03:36 +02002569 if (width > precision) {
2570 Py_UCS4 fillchar;
2571 fill = width - precision;
2572 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002573 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2574 return NULL;
2575 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002576 }
Victor Stinner15a11362012-10-06 23:48:20 +02002577 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002578 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002579 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2580 return NULL;
2581 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002582 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583
2584 unicode_write_cstr(writer->buffer, writer->pos, buffer, len);
2585 writer->pos += len;
Victor Stinnere215d962012-10-06 23:03:36 +02002586 break;
2587 }
2588
2589 case 'p':
2590 {
2591 char number[MAX_LONG_LONG_CHARS];
2592
2593 len = sprintf(number, "%p", va_arg(*vargs, void*));
2594 assert(len >= 0);
2595
2596 /* %p is ill-defined: ensure leading 0x. */
2597 if (number[1] == 'X')
2598 number[1] = 'x';
2599 else if (number[1] != 'x') {
2600 memmove(number + 2, number,
2601 strlen(number) + 1);
2602 number[0] = '0';
2603 number[1] = 'x';
2604 len += 2;
2605 }
2606
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002607 assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127);
2608 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002609 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002610 unicode_write_cstr(writer->buffer, writer->pos, number, len);
2611 writer->pos += len;
Victor Stinnere215d962012-10-06 23:03:36 +02002612 break;
2613 }
2614
2615 case 's':
2616 {
2617 /* UTF-8 */
2618 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002620 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002621 break;
2622 }
2623
2624 case 'U':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 assert(obj && _PyUnicode_CHECK(obj));
2628
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002630 return NULL;
2631 break;
2632 }
2633
2634 case 'V':
2635 {
2636 PyObject *obj = va_arg(*vargs, PyObject *);
2637 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002638 if (obj) {
2639 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002640 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002641 return NULL;
2642 }
2643 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 assert(str != NULL);
2645 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002646 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002647 }
2648 break;
2649 }
2650
2651 case 'S':
2652 {
2653 PyObject *obj = va_arg(*vargs, PyObject *);
2654 PyObject *str;
2655 assert(obj);
2656 str = PyObject_Str(obj);
2657 if (!str)
2658 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002659 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002660 Py_DECREF(str);
2661 return NULL;
2662 }
2663 Py_DECREF(str);
2664 break;
2665 }
2666
2667 case 'R':
2668 {
2669 PyObject *obj = va_arg(*vargs, PyObject *);
2670 PyObject *repr;
2671 assert(obj);
2672 repr = PyObject_Repr(obj);
2673 if (!repr)
2674 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002676 Py_DECREF(repr);
2677 return NULL;
2678 }
2679 Py_DECREF(repr);
2680 break;
2681 }
2682
2683 case 'A':
2684 {
2685 PyObject *obj = va_arg(*vargs, PyObject *);
2686 PyObject *ascii;
2687 assert(obj);
2688 ascii = PyObject_ASCII(obj);
2689 if (!ascii)
2690 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002692 Py_DECREF(ascii);
2693 return NULL;
2694 }
2695 Py_DECREF(ascii);
2696 break;
2697 }
2698
2699 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002700 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002701 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002702 break;
2703
2704 default:
2705 /* if we stumble upon an unknown formatting code, copy the rest
2706 of the format string to the output string. (we cannot just
2707 skip the code, since there's no way to know what's in the
2708 argument list) */
2709 len = strlen(p);
2710 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2711 return NULL;
2712 f = p+len;
2713 return f;
2714 }
2715
2716 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002717 return f;
2718}
2719
Walter Dörwaldd2034312007-05-18 16:29:38 +00002720PyObject *
2721PyUnicode_FromFormatV(const char *format, va_list vargs)
2722{
Victor Stinnere215d962012-10-06 23:03:36 +02002723 va_list vargs2;
2724 const char *f;
2725 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002726
Victor Stinner8f674cc2013-04-17 23:02:17 +02002727 _PyUnicodeWriter_Init(&writer);
2728 writer.min_length = strlen(format) + 100;
2729 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002730
2731 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2732 Copy it to be able to pass a reference to a subfunction. */
2733 Py_VA_COPY(vargs2, vargs);
2734
2735 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002737 f = unicode_fromformat_arg(&writer, f, &vargs2);
2738 if (f == NULL)
2739 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 const char *p;
2743 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002744
Victor Stinnere215d962012-10-06 23:03:36 +02002745 p = f;
2746 do
2747 {
2748 if ((unsigned char)*p > 127) {
2749 PyErr_Format(PyExc_ValueError,
2750 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2751 "string, got a non-ASCII byte: 0x%02x",
2752 (unsigned char)*p);
2753 return NULL;
2754 }
2755 p++;
2756 }
2757 while (*p != '\0' && *p != '%');
2758 len = p - f;
2759
2760 if (*p == '\0')
2761 writer.overallocate = 0;
2762 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2763 goto fail;
2764 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2765 writer.pos += len;
2766
2767 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 }
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return _PyUnicodeWriter_Finish(&writer);
2771
2772 fail:
2773 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775}
2776
Walter Dörwaldd2034312007-05-18 16:29:38 +00002777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002780 PyObject* ret;
2781 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782
2783#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 ret = PyUnicode_FromFormatV(format, vargs);
2789 va_end(vargs);
2790 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002791}
2792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793#ifdef HAVE_WCHAR_H
2794
Victor Stinner5593d8a2010-10-02 11:11:27 +00002795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796 convert a Unicode object to a wide character string.
2797
Victor Stinnerd88d9832011-09-06 02:00:05 +02002798 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002799 character) required to convert the unicode object. Ignore size argument.
2800
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002803 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002804static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002806 wchar_t *w,
2807 Py_ssize_t size)
2808{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 const wchar_t *wstr;
2811
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (wstr == NULL)
2814 return -1;
2815
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002817 if (size > res)
2818 size = res + 1;
2819 else
2820 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002822 return res;
2823 }
2824 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002826}
2827
2828Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002829PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 wchar_t *w,
2831 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832{
2833 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 PyErr_BadInternalCall();
2835 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002837 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838}
2839
Victor Stinner137c34c2010-09-29 10:25:54 +00002840wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002841PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002842 Py_ssize_t *size)
2843{
2844 wchar_t* buffer;
2845 Py_ssize_t buflen;
2846
2847 if (unicode == NULL) {
2848 PyErr_BadInternalCall();
2849 return NULL;
2850 }
2851
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002852 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002853 if (buflen == -1)
2854 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002855 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002856 PyErr_NoMemory();
2857 return NULL;
2858 }
2859
Victor Stinner137c34c2010-09-29 10:25:54 +00002860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2861 if (buffer == NULL) {
2862 PyErr_NoMemory();
2863 return NULL;
2864 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002865 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002866 if (buflen == -1) {
2867 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002869 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 if (size != NULL)
2871 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002872 return buffer;
2873}
2874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002875#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Alexander Belopolsky40018472011-02-26 01:02:56 +00002877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002881 void *data;
2882 int kind;
2883
Victor Stinner8faf8212011-12-08 22:14:11 +01002884 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 PyErr_SetString(PyExc_ValueError,
2886 "chr() arg not in range(0x110000)");
2887 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002888 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002889
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002890 if ((Py_UCS4)ordinal < 256)
2891 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 v = PyUnicode_New(1, ordinal);
2894 if (v == NULL)
2895 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002896 kind = PyUnicode_KIND(v);
2897 data = PyUnicode_DATA(v);
2898 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002899 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002901}
2902
Alexander Belopolsky40018472011-02-26 01:02:56 +00002903PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002904PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002906 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002908 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002909 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002910 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 Py_INCREF(obj);
2912 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002913 }
2914 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 /* For a Unicode subtype that's not a Unicode object,
2916 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002917 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002918 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002919 PyErr_Format(PyExc_TypeError,
2920 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002921 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002922 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002923}
2924
Alexander Belopolsky40018472011-02-26 01:02:56 +00002925PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002926PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002927 const char *encoding,
2928 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002929{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002930 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002931 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002932
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 PyErr_BadInternalCall();
2935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Decoding bytes objects is the most common case and should be fast */
2939 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002940 if (PyBytes_GET_SIZE(obj) == 0)
2941 _Py_RETURN_UNICODE_EMPTY();
2942 v = PyUnicode_Decode(
2943 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2944 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002945 return v;
2946 }
2947
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002948 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 PyErr_SetString(PyExc_TypeError,
2950 "decoding str is not supported");
2951 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002953
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002954 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2955 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2956 PyErr_Format(PyExc_TypeError,
2957 "coercing to str: need bytes, bytearray "
2958 "or buffer-like object, %.80s found",
2959 Py_TYPE(obj)->tp_name);
2960 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002961 }
Tim Petersced69f82003-09-16 20:30:58 +00002962
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002963 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002964 PyBuffer_Release(&buffer);
2965 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002967
Serhiy Storchaka05997252013-01-26 12:14:02 +02002968 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002969 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002970 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971}
2972
/* Normalize an encoding name: upper-case characters (per Py_ISUPPER) are
   converted with Py_TOLOWER and '_' is replaced with '-', in order to
   catch e.g. UTF_8.

   encoding   null-terminated encoding name, or NULL (treated as "utf-8").
   lower      output buffer receiving the null-terminated normalized name.
   lower_len  capacity of lower in bytes, terminator included.

   Return 1 on success.  Return 0 on error (the normalized name, with its
   terminator, does not fit in lower_len bytes); lower may then have been
   partially written and is not null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* "utf-8" needs 5 characters plus the terminator. */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator.  Without this
       check, lower_len - 1 below would wrap around and the copy loop would
       write outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3011
Alexander Belopolsky40018472011-02-26 01:02:56 +00003012PyObject *
3013PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003014 Py_ssize_t size,
3015 const char *encoding,
3016 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003017{
3018 PyObject *buffer = NULL, *unicode;
3019 Py_buffer info;
3020 char lower[11]; /* Enough for any encoding shortcut */
3021
Fred Drakee4315f52000-05-09 19:53:39 +00003022 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003023 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003024 if ((strcmp(lower, "utf-8") == 0) ||
3025 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003026 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003027 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003028 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003029 (strcmp(lower, "iso-8859-1") == 0) ||
3030 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003031 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003032#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003033 else if (strcmp(lower, "mbcs") == 0)
3034 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003035#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003036 else if (strcmp(lower, "ascii") == 0)
3037 return PyUnicode_DecodeASCII(s, size, errors);
3038 else if (strcmp(lower, "utf-16") == 0)
3039 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3040 else if (strcmp(lower, "utf-32") == 0)
3041 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043
3044 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003045 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003046 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003047 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003048 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 if (buffer == NULL)
3050 goto onError;
3051 unicode = PyCodec_Decode(buffer, encoding, errors);
3052 if (unicode == NULL)
3053 goto onError;
3054 if (!PyUnicode_Check(unicode)) {
3055 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003056 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003057 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 Py_DECREF(unicode);
3059 goto onError;
3060 }
3061 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003062 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003063
Benjamin Peterson29060642009-01-31 22:14:21 +00003064 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 Py_XDECREF(buffer);
3066 return NULL;
3067}
3068
Alexander Belopolsky40018472011-02-26 01:02:56 +00003069PyObject *
3070PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003071 const char *encoding,
3072 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003073{
3074 PyObject *v;
3075
3076 if (!PyUnicode_Check(unicode)) {
3077 PyErr_BadArgument();
3078 goto onError;
3079 }
3080
3081 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003083
3084 /* Decode via the codec registry */
3085 v = PyCodec_Decode(unicode, encoding, errors);
3086 if (v == NULL)
3087 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003088 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003089
Benjamin Peterson29060642009-01-31 22:14:21 +00003090 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003091 return NULL;
3092}
3093
Alexander Belopolsky40018472011-02-26 01:02:56 +00003094PyObject *
3095PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003096 const char *encoding,
3097 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003098{
3099 PyObject *v;
3100
3101 if (!PyUnicode_Check(unicode)) {
3102 PyErr_BadArgument();
3103 goto onError;
3104 }
3105
3106 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003107 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003108
3109 /* Decode via the codec registry */
3110 v = PyCodec_Decode(unicode, encoding, errors);
3111 if (v == NULL)
3112 goto onError;
3113 if (!PyUnicode_Check(v)) {
3114 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003115 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003116 Py_TYPE(v)->tp_name);
3117 Py_DECREF(v);
3118 goto onError;
3119 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003120 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003121
Benjamin Peterson29060642009-01-31 22:14:21 +00003122 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003123 return NULL;
3124}
3125
Alexander Belopolsky40018472011-02-26 01:02:56 +00003126PyObject *
3127PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003128 Py_ssize_t size,
3129 const char *encoding,
3130 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131{
3132 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003133
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134 unicode = PyUnicode_FromUnicode(s, size);
3135 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3138 Py_DECREF(unicode);
3139 return v;
3140}
3141
Alexander Belopolsky40018472011-02-26 01:02:56 +00003142PyObject *
3143PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003144 const char *encoding,
3145 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003146{
3147 PyObject *v;
3148
3149 if (!PyUnicode_Check(unicode)) {
3150 PyErr_BadArgument();
3151 goto onError;
3152 }
3153
3154 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003156
3157 /* Encode via the codec registry */
3158 v = PyCodec_Encode(unicode, encoding, errors);
3159 if (v == NULL)
3160 goto onError;
3161 return v;
3162
Benjamin Peterson29060642009-01-31 22:14:21 +00003163 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003164 return NULL;
3165}
3166
/* Find the first character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Return the index of the first
   element for which the conversion fails, or 0 if no failure is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* A single character (or surrogate pair) followed by a null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        /* Remember where this character starts in case it fails. */
        const wchar_t *mark = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = *cursor;
        cursor += 1;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return mark - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3213
Victor Stinner1b579672011-12-17 05:47:23 +01003214static int
3215locale_error_handler(const char *errors, int *surrogateescape)
3216{
3217 if (errors == NULL) {
3218 *surrogateescape = 0;
3219 return 0;
3220 }
3221
3222 if (strcmp(errors, "strict") == 0) {
3223 *surrogateescape = 0;
3224 return 0;
3225 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003226 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003227 *surrogateescape = 1;
3228 return 0;
3229 }
3230 PyErr_Format(PyExc_ValueError,
3231 "only 'strict' and 'surrogateescape' error handlers "
3232 "are supported, not '%s'",
3233 errors);
3234 return -1;
3235}
3236
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003237PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003238PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003239{
3240 Py_ssize_t wlen, wlen2;
3241 wchar_t *wstr;
3242 PyObject *bytes = NULL;
3243 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003244 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003245 PyObject *exc;
3246 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003247 int surrogateescape;
3248
3249 if (locale_error_handler(errors, &surrogateescape) < 0)
3250 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003251
3252 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3253 if (wstr == NULL)
3254 return NULL;
3255
3256 wlen2 = wcslen(wstr);
3257 if (wlen2 != wlen) {
3258 PyMem_Free(wstr);
3259 PyErr_SetString(PyExc_TypeError, "embedded null character");
3260 return NULL;
3261 }
3262
3263 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003264 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003265 char *str;
3266
3267 str = _Py_wchar2char(wstr, &error_pos);
3268 if (str == NULL) {
3269 if (error_pos == (size_t)-1) {
3270 PyErr_NoMemory();
3271 PyMem_Free(wstr);
3272 return NULL;
3273 }
3274 else {
3275 goto encode_error;
3276 }
3277 }
3278 PyMem_Free(wstr);
3279
3280 bytes = PyBytes_FromString(str);
3281 PyMem_Free(str);
3282 }
3283 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003284 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003285 size_t len, len2;
3286
3287 len = wcstombs(NULL, wstr, 0);
3288 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003289 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003290 goto encode_error;
3291 }
3292
3293 bytes = PyBytes_FromStringAndSize(NULL, len);
3294 if (bytes == NULL) {
3295 PyMem_Free(wstr);
3296 return NULL;
3297 }
3298
3299 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3300 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003301 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003302 goto encode_error;
3303 }
3304 PyMem_Free(wstr);
3305 }
3306 return bytes;
3307
3308encode_error:
3309 errmsg = strerror(errno);
3310 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003311
3312 if (error_pos == (size_t)-1)
3313 error_pos = wcstombs_errorpos(wstr);
3314
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003315 PyMem_Free(wstr);
3316 Py_XDECREF(bytes);
3317
Victor Stinner2f197072011-12-17 07:08:30 +01003318 if (errmsg != NULL) {
3319 size_t errlen;
3320 wstr = _Py_char2wchar(errmsg, &errlen);
3321 if (wstr != NULL) {
3322 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003323 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003324 } else
3325 errmsg = NULL;
3326 }
3327 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003328 reason = PyUnicode_FromString(
3329 "wcstombs() encountered an unencodable "
3330 "wide character");
3331 if (reason == NULL)
3332 return NULL;
3333
3334 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3335 "locale", unicode,
3336 (Py_ssize_t)error_pos,
3337 (Py_ssize_t)(error_pos+1),
3338 reason);
3339 Py_DECREF(reason);
3340 if (exc != NULL) {
3341 PyCodec_StrictErrors(exc);
3342 Py_XDECREF(exc);
3343 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003344 return NULL;
3345}
3346
Victor Stinnerad158722010-10-27 00:25:46 +00003347PyObject *
3348PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003349{
Victor Stinner99b95382011-07-04 14:23:54 +02003350#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003351 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003352#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003353 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003354#else
Victor Stinner793b5312011-04-27 00:24:21 +02003355 PyInterpreterState *interp = PyThreadState_GET()->interp;
3356 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3357 cannot use it to encode and decode filenames before it is loaded. Load
3358 the Python codec requires to encode at least its own filename. Use the C
3359 version of the locale codec until the codec registry is initialized and
3360 the Python codec is loaded.
3361
3362 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3363 cannot only rely on it: check also interp->fscodec_initialized for
3364 subinterpreters. */
3365 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003366 return PyUnicode_AsEncodedString(unicode,
3367 Py_FileSystemDefaultEncoding,
3368 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003369 }
3370 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003371 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003372 }
Victor Stinnerad158722010-10-27 00:25:46 +00003373#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003374}
3375
Alexander Belopolsky40018472011-02-26 01:02:56 +00003376PyObject *
3377PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003378 const char *encoding,
3379 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380{
3381 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003382 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003383
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 if (!PyUnicode_Check(unicode)) {
3385 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387 }
Fred Drakee4315f52000-05-09 19:53:39 +00003388
Fred Drakee4315f52000-05-09 19:53:39 +00003389 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003390 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003391 if ((strcmp(lower, "utf-8") == 0) ||
3392 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003393 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003394 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003395 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003396 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003397 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003398 }
Victor Stinner37296e82010-06-10 13:36:23 +00003399 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003400 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003401 (strcmp(lower, "iso-8859-1") == 0) ||
3402 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003403 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003404#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003405 else if (strcmp(lower, "mbcs") == 0)
3406 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003407#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003408 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003409 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411
3412 /* Encode via the codec registry */
3413 v = PyCodec_Encode(unicode, encoding, errors);
3414 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003415 return NULL;
3416
3417 /* The normal path */
3418 if (PyBytes_Check(v))
3419 return v;
3420
3421 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003423 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003424 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003425
3426 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3427 "encoder %s returned bytearray instead of bytes",
3428 encoding);
3429 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003430 Py_DECREF(v);
3431 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003432 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003433
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003434 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3435 Py_DECREF(v);
3436 return b;
3437 }
3438
3439 PyErr_Format(PyExc_TypeError,
3440 "encoder did not return a bytes object (type=%.400s)",
3441 Py_TYPE(v)->tp_name);
3442 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003443 return NULL;
3444}
3445
Alexander Belopolsky40018472011-02-26 01:02:56 +00003446PyObject *
3447PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003448 const char *encoding,
3449 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003450{
3451 PyObject *v;
3452
3453 if (!PyUnicode_Check(unicode)) {
3454 PyErr_BadArgument();
3455 goto onError;
3456 }
3457
3458 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003459 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003460
3461 /* Encode via the codec registry */
3462 v = PyCodec_Encode(unicode, encoding, errors);
3463 if (v == NULL)
3464 goto onError;
3465 if (!PyUnicode_Check(v)) {
3466 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003467 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003468 Py_TYPE(v)->tp_name);
3469 Py_DECREF(v);
3470 goto onError;
3471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003473
Benjamin Peterson29060642009-01-31 22:14:21 +00003474 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 return NULL;
3476}
3477
/* Find the offset of the first byte sequence in str that cannot be
   decoded with the current locale.

   str  multibyte string to scan.
   len  number of bytes of str to examine.

   Return the byte offset at which mbrtowc() reports an invalid or
   incomplete sequence, or 0 if none is found.  When HAVE_MBRTOWC is not
   defined, no scan is possible and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3508
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003509PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003510PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003511 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003512{
3513 wchar_t smallbuf[256];
3514 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3515 wchar_t *wstr;
3516 size_t wlen, wlen2;
3517 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003518 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003519 size_t error_pos;
3520 char *errmsg;
3521 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003522
3523 if (locale_error_handler(errors, &surrogateescape) < 0)
3524 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003525
3526 if (str[len] != '\0' || len != strlen(str)) {
3527 PyErr_SetString(PyExc_TypeError, "embedded null character");
3528 return NULL;
3529 }
3530
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003531 if (surrogateescape) {
3532 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003533 wstr = _Py_char2wchar(str, &wlen);
3534 if (wstr == NULL) {
3535 if (wlen == (size_t)-1)
3536 PyErr_NoMemory();
3537 else
3538 PyErr_SetFromErrno(PyExc_OSError);
3539 return NULL;
3540 }
3541
3542 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003543 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003544 }
3545 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003546 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547#ifndef HAVE_BROKEN_MBSTOWCS
3548 wlen = mbstowcs(NULL, str, 0);
3549#else
3550 wlen = len;
3551#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003552 if (wlen == (size_t)-1)
3553 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003554 if (wlen+1 <= smallbuf_len) {
3555 wstr = smallbuf;
3556 }
3557 else {
3558 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3559 return PyErr_NoMemory();
3560
3561 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3562 if (!wstr)
3563 return PyErr_NoMemory();
3564 }
3565
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003566 wlen2 = mbstowcs(wstr, str, wlen+1);
3567 if (wlen2 == (size_t)-1) {
3568 if (wstr != smallbuf)
3569 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003570 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003571 }
3572#ifdef HAVE_BROKEN_MBSTOWCS
3573 assert(wlen2 == wlen);
3574#endif
3575 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3576 if (wstr != smallbuf)
3577 PyMem_Free(wstr);
3578 }
3579 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003580
3581decode_error:
3582 errmsg = strerror(errno);
3583 assert(errmsg != NULL);
3584
3585 error_pos = mbstowcs_errorpos(str, len);
3586 if (errmsg != NULL) {
3587 size_t errlen;
3588 wstr = _Py_char2wchar(errmsg, &errlen);
3589 if (wstr != NULL) {
3590 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003591 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003592 } else
3593 errmsg = NULL;
3594 }
3595 if (errmsg == NULL)
3596 reason = PyUnicode_FromString(
3597 "mbstowcs() encountered an invalid multibyte sequence");
3598 if (reason == NULL)
3599 return NULL;
3600
3601 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3602 "locale", str, len,
3603 (Py_ssize_t)error_pos,
3604 (Py_ssize_t)(error_pos+1),
3605 reason);
3606 Py_DECREF(reason);
3607 if (exc != NULL) {
3608 PyCodec_StrictErrors(exc);
3609 Py_XDECREF(exc);
3610 }
3611 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003612}
3613
3614PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003615PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003616{
3617 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003618 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003619}
3620
3621
3622PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003623PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003624 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003625 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3626}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003627
Christian Heimes5894ba72007-11-04 11:43:14 +00003628PyObject*
3629PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3630{
Victor Stinner99b95382011-07-04 14:23:54 +02003631#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003632 return PyUnicode_DecodeMBCS(s, size, NULL);
3633#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003634 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003635#else
Victor Stinner793b5312011-04-27 00:24:21 +02003636 PyInterpreterState *interp = PyThreadState_GET()->interp;
3637 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3638 cannot use it to encode and decode filenames before it is loaded. Load
3639 the Python codec requires to encode at least its own filename. Use the C
3640 version of the locale codec until the codec registry is initialized and
3641 the Python codec is loaded.
3642
3643 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3644 cannot only rely on it: check also interp->fscodec_initialized for
3645 subinterpreters. */
3646 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003647 return PyUnicode_Decode(s, size,
3648 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003649 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003650 }
3651 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003652 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003653 }
Victor Stinnerad158722010-10-27 00:25:46 +00003654#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003655}
3656
Martin v. Löwis011e8422009-05-05 04:43:17 +00003657
3658int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003659_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003660{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003661 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003662
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003663 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003664 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003665 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3666 PyUnicode_GET_LENGTH(str), '\0', 1);
3667 if (pos == -1)
3668 return 0;
3669 else
3670 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003671}
3672
Antoine Pitrou13348842012-01-29 18:36:34 +01003673int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003674PyUnicode_FSConverter(PyObject* arg, void* addr)
3675{
3676 PyObject *output = NULL;
3677 Py_ssize_t size;
3678 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003679 if (arg == NULL) {
3680 Py_DECREF(*(PyObject**)addr);
3681 return 1;
3682 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003683 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003684 output = arg;
3685 Py_INCREF(output);
3686 }
3687 else {
3688 arg = PyUnicode_FromObject(arg);
3689 if (!arg)
3690 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003691 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003692 Py_DECREF(arg);
3693 if (!output)
3694 return 0;
3695 if (!PyBytes_Check(output)) {
3696 Py_DECREF(output);
3697 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3698 return 0;
3699 }
3700 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003701 size = PyBytes_GET_SIZE(output);
3702 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003703 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003704 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003705 Py_DECREF(output);
3706 return 0;
3707 }
3708 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003709 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003710}
3711
3712
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003713int
3714PyUnicode_FSDecoder(PyObject* arg, void* addr)
3715{
3716 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003717 if (arg == NULL) {
3718 Py_DECREF(*(PyObject**)addr);
3719 return 1;
3720 }
3721 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003722 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003723 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003724 output = arg;
3725 Py_INCREF(output);
3726 }
3727 else {
3728 arg = PyBytes_FromObject(arg);
3729 if (!arg)
3730 return 0;
3731 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3732 PyBytes_GET_SIZE(arg));
3733 Py_DECREF(arg);
3734 if (!output)
3735 return 0;
3736 if (!PyUnicode_Check(output)) {
3737 Py_DECREF(output);
3738 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3739 return 0;
3740 }
3741 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003742 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003743 Py_DECREF(output);
3744 return 0;
3745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003746 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003747 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003748 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3749 Py_DECREF(output);
3750 return 0;
3751 }
3752 *(PyObject**)addr = output;
3753 return Py_CLEANUP_SUPPORTED;
3754}
3755
3756
Martin v. Löwis5b222132007-06-10 09:51:05 +00003757char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003759{
Christian Heimesf3863112007-11-22 07:46:41 +00003760 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003762 if (!PyUnicode_Check(unicode)) {
3763 PyErr_BadArgument();
3764 return NULL;
3765 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003766 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003767 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003769 if (PyUnicode_UTF8(unicode) == NULL) {
3770 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3772 if (bytes == NULL)
3773 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3775 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003776 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003777 Py_DECREF(bytes);
3778 return NULL;
3779 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003780 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3781 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3782 PyBytes_AS_STRING(bytes),
3783 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784 Py_DECREF(bytes);
3785 }
3786
3787 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003788 *psize = PyUnicode_UTF8_LENGTH(unicode);
3789 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003790}
3791
3792char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3796}
3797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003798Py_UNICODE *
3799PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801 const unsigned char *one_byte;
3802#if SIZEOF_WCHAR_T == 4
3803 const Py_UCS2 *two_bytes;
3804#else
3805 const Py_UCS4 *four_bytes;
3806 const Py_UCS4 *ucs4_end;
3807 Py_ssize_t num_surrogates;
3808#endif
3809 wchar_t *w;
3810 wchar_t *wchar_end;
3811
3812 if (!PyUnicode_Check(unicode)) {
3813 PyErr_BadArgument();
3814 return NULL;
3815 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003816 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 assert(_PyUnicode_KIND(unicode) != 0);
3819 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003821 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003823 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3824 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 num_surrogates = 0;
3826
3827 for (; four_bytes < ucs4_end; ++four_bytes) {
3828 if (*four_bytes > 0xFFFF)
3829 ++num_surrogates;
3830 }
3831
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3833 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3834 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 PyErr_NoMemory();
3836 return NULL;
3837 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003838 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 w = _PyUnicode_WSTR(unicode);
3841 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3842 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3844 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003845 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003847 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3848 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 }
3850 else
3851 *w = *four_bytes;
3852
3853 if (w > wchar_end) {
3854 assert(0 && "Miscalculated string end");
3855 }
3856 }
3857 *w = 0;
3858#else
3859 /* sizeof(wchar_t) == 4 */
3860 Py_FatalError("Impossible unicode object state, wstr and str "
3861 "should share memory already.");
3862 return NULL;
3863#endif
3864 }
3865 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3867 (_PyUnicode_LENGTH(unicode) + 1));
3868 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 PyErr_NoMemory();
3870 return NULL;
3871 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003872 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3873 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3874 w = _PyUnicode_WSTR(unicode);
3875 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003877 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3878 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 for (; w < wchar_end; ++one_byte, ++w)
3880 *w = *one_byte;
3881 /* null-terminate the wstr */
3882 *w = 0;
3883 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003884 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003886 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 for (; w < wchar_end; ++two_bytes, ++w)
3888 *w = *two_bytes;
3889 /* null-terminate the wstr */
3890 *w = 0;
3891#else
3892 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003893 PyObject_FREE(_PyUnicode_WSTR(unicode));
3894 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 Py_FatalError("Impossible unicode object state, wstr "
3896 "and str should share memory already.");
3897 return NULL;
3898#endif
3899 }
3900 else {
3901 assert(0 && "This should never happen.");
3902 }
3903 }
3904 }
3905 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003906 *size = PyUnicode_WSTR_LENGTH(unicode);
3907 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003908}
3909
Alexander Belopolsky40018472011-02-26 01:02:56 +00003910Py_UNICODE *
3911PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914}
3915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916
Alexander Belopolsky40018472011-02-26 01:02:56 +00003917Py_ssize_t
3918PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919{
3920 if (!PyUnicode_Check(unicode)) {
3921 PyErr_BadArgument();
3922 goto onError;
3923 }
3924 return PyUnicode_GET_SIZE(unicode);
3925
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 return -1;
3928}
3929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003930Py_ssize_t
3931PyUnicode_GetLength(PyObject *unicode)
3932{
Victor Stinner07621332012-06-16 04:53:46 +02003933 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 PyErr_BadArgument();
3935 return -1;
3936 }
Victor Stinner07621332012-06-16 04:53:46 +02003937 if (PyUnicode_READY(unicode) == -1)
3938 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939 return PyUnicode_GET_LENGTH(unicode);
3940}
3941
3942Py_UCS4
3943PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3944{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003945 void *data;
3946 int kind;
3947
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003948 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3949 PyErr_BadArgument();
3950 return (Py_UCS4)-1;
3951 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003952 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003953 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954 return (Py_UCS4)-1;
3955 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003956 data = PyUnicode_DATA(unicode);
3957 kind = PyUnicode_KIND(unicode);
3958 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959}
3960
3961int
3962PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3963{
3964 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003965 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966 return -1;
3967 }
Victor Stinner488fa492011-12-12 00:01:39 +01003968 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003969 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003970 PyErr_SetString(PyExc_IndexError, "string index out of range");
3971 return -1;
3972 }
Victor Stinner488fa492011-12-12 00:01:39 +01003973 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003974 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003975 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3976 PyErr_SetString(PyExc_ValueError, "character out of range");
3977 return -1;
3978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3980 index, ch);
3981 return 0;
3982}
3983
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
3989
Victor Stinner554f3f02010-06-16 23:33:54 +00003990/* create or adjust a UnicodeDecodeError */
3991static void
3992make_decode_exception(PyObject **exceptionObject,
3993 const char *encoding,
3994 const char *input, Py_ssize_t length,
3995 Py_ssize_t startpos, Py_ssize_t endpos,
3996 const char *reason)
3997{
3998 if (*exceptionObject == NULL) {
3999 *exceptionObject = PyUnicodeDecodeError_Create(
4000 encoding, input, length, startpos, endpos, reason);
4001 }
4002 else {
4003 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4004 goto onError;
4005 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4006 goto onError;
4007 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4008 goto onError;
4009 }
4010 return;
4011
4012onError:
4013 Py_DECREF(*exceptionObject);
4014 *exceptionObject = NULL;
4015}
4016
#ifdef HAVE_MBCS
/* error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   *errorHandler is looked up from `errors` on first use and kept for
   later calls; *exceptionObject is created or updated the same way.
   On success *input / *inend are refreshed from the exception's object,
   *output is grown if needed, the replacement is copied at *outpos,
   *outpos is advanced, and *endinpos / *inptr are set to the position
   returned by the handler.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not fall through and read a non-bytes object as bytes. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string).
       Each addition is checked so the sum cannot overflow Py_ssize_t. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is both larger and safe. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4123
4124static int
4125unicode_decode_call_errorhandler_writer(
4126 const char *errors, PyObject **errorHandler,
4127 const char *encoding, const char *reason,
4128 const char **input, const char **inend, Py_ssize_t *startinpos,
4129 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4130 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4131{
4132 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4133
4134 PyObject *restuple = NULL;
4135 PyObject *repunicode = NULL;
4136 Py_ssize_t insize;
4137 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004138 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004139 PyObject *inputobj = NULL;
4140
4141 if (*errorHandler == NULL) {
4142 *errorHandler = PyCodec_LookupError(errors);
4143 if (*errorHandler == NULL)
4144 goto onError;
4145 }
4146
4147 make_decode_exception(exceptionObject,
4148 encoding,
4149 *input, *inend - *input,
4150 *startinpos, *endinpos,
4151 reason);
4152 if (*exceptionObject == NULL)
4153 goto onError;
4154
4155 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4156 if (restuple == NULL)
4157 goto onError;
4158 if (!PyTuple_Check(restuple)) {
4159 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4160 goto onError;
4161 }
4162 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004163 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004164
4165 /* Copy back the bytes variables, which might have been modified by the
4166 callback */
4167 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4168 if (!inputobj)
4169 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004170 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004171 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004172 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004173 *input = PyBytes_AS_STRING(inputobj);
4174 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004175 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004176 /* we can DECREF safely, as the exception has another reference,
4177 so the object won't go away. */
4178 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004179
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004182 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4184 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004185 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186
Victor Stinner8f674cc2013-04-17 23:02:17 +02004187 if (PyUnicode_READY(repunicode) < 0)
4188 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004189 replen = PyUnicode_GET_LENGTH(repunicode);
4190 writer->min_length += replen;
4191 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004192 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004194 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004195
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004197 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004198
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004200 Py_XDECREF(restuple);
4201 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004205 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206}
4207
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Each macro may evaluate its argument more than once. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Map a base-64 character to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62, anything else: 63).  Only meaningful when
   IS_BASE64(c) is true. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* Map the low 6 bits of n to the corresponding base-64 character. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c in a UTF-7 string decodes as itself.
 * Decoding is permissive: every ASCII byte is accepted literally except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4242
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 for the table lookup to happen; NUL and non-ASCII
 * values are never direct.  directO and directWS are parenthesized in
 * the expansion so that compound expressions (e.g. "a || b") can be
 * passed safely.  c may be evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288
Alexander Belopolsky40018472011-02-26 01:02:56 +00004289PyObject *
4290PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004291 Py_ssize_t size,
4292 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004293{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004294 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4295}
4296
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297/* The decoder. The only state we preserve is our read position,
4298 * i.e. how many characters we have consumed. So if we end in the
4299 * middle of a shift sequence we have to back off the read position
4300 * and the output to the beginning of the sequence, otherwise we lose
4301 * all the shift state (seen bits, number of bits seen, high
4302 * surrogate). */
4303
Alexander Belopolsky40018472011-02-26 01:02:56 +00004304PyObject *
4305PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004306 Py_ssize_t size,
4307 const char *errors,
4308 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004309{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004311 Py_ssize_t startinpos;
4312 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004314 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004315 const char *errmsg = "";
4316 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004317 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 unsigned int base64bits = 0;
4319 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004320 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 PyObject *errorHandler = NULL;
4322 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004324 if (size == 0) {
4325 if (consumed)
4326 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004327 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004328 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004329
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004331 _PyUnicodeWriter_Init(&writer);
4332 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004333
4334 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004335 e = s + size;
4336
4337 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004338 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004340 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004341
Antoine Pitrou244651a2009-05-04 18:56:13 +00004342 if (inShift) { /* in a base-64 section */
4343 if (IS_BASE64(ch)) { /* consume a base-64 character */
4344 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4345 base64bits += 6;
4346 s++;
4347 if (base64bits >= 16) {
4348 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004349 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 base64bits -= 16;
4351 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004352 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 if (surrogate) {
4354 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004355 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4356 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004357 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004358 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004360 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 }
4362 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004363 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004364 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 }
4367 }
Victor Stinner551ac952011-11-29 22:58:13 +01004368 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 /* first surrogate */
4370 surrogate = outCh;
4371 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004373 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004374 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 }
4376 }
4377 }
4378 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379 inShift = 0;
4380 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004382 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004383 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004384 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 if (base64bits > 0) { /* left-over bits */
4387 if (base64bits >= 6) {
4388 /* We've seen at least one base-64 character */
4389 errmsg = "partial character in shift sequence";
4390 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004391 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392 else {
4393 /* Some bits remain; they should be zero */
4394 if (base64buffer != 0) {
4395 errmsg = "non-zero padding bits in shift sequence";
4396 goto utf7Error;
4397 }
4398 }
4399 }
4400 if (ch != '-') {
4401 /* '-' is absorbed; other terminating
4402 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004403 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004404 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004406 }
4407 }
4408 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 s++; /* consume '+' */
4411 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004413 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 }
4416 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004418 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004420 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 }
4422 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004425 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004426 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 else {
4429 startinpos = s-starts;
4430 s++;
4431 errmsg = "unexpected special character";
4432 goto utf7Error;
4433 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004437 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 errors, &errorHandler,
4439 "utf7", errmsg,
4440 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004441 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443 }
4444
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 /* end of string */
4446
4447 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4448 /* if we're in an inconsistent state, that's an error */
4449 if (surrogate ||
4450 (base64bits >= 6) ||
4451 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004453 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004454 errors, &errorHandler,
4455 "utf7", "unterminated shift sequence",
4456 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004457 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 goto onError;
4459 if (s < e)
4460 goto restart;
4461 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463
4464 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004465 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004467 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004468 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 }
4470 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004471 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004472 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004473 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 Py_XDECREF(errorHandler);
4476 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 Py_XDECREF(errorHandler);
4481 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004482 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483 return NULL;
4484}
4485
4486
Alexander Belopolsky40018472011-02-26 01:02:56 +00004487PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004488_PyUnicode_EncodeUTF7(PyObject *str,
4489 int base64SetO,
4490 int base64WhiteSpace,
4491 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004493 int kind;
4494 void *data;
4495 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004496 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004497 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004498 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 unsigned int base64bits = 0;
4500 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501 char * out;
4502 char * start;
4503
Benjamin Petersonbac79492012-01-14 13:34:47 -05004504 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004505 return NULL;
4506 kind = PyUnicode_KIND(str);
4507 data = PyUnicode_DATA(str);
4508 len = PyUnicode_GET_LENGTH(str);
4509
4510 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004513 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004514 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004515 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004516 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517 if (v == NULL)
4518 return NULL;
4519
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004520 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004521 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004522 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 if (inShift) {
4525 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4526 /* shifting out */
4527 if (base64bits) { /* output remaining bits */
4528 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4529 base64buffer = 0;
4530 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531 }
4532 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 /* Characters not in the BASE64 set implicitly unshift the sequence
4534 so no '-' is required, except if the character is itself a '-' */
4535 if (IS_BASE64(ch) || ch == '-') {
4536 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538 *out++ = (char) ch;
4539 }
4540 else {
4541 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004542 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004543 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544 else { /* not in a shift sequence */
4545 if (ch == '+') {
4546 *out++ = '+';
4547 *out++ = '-';
4548 }
4549 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4550 *out++ = (char) ch;
4551 }
4552 else {
4553 *out++ = '+';
4554 inShift = 1;
4555 goto encode_char;
4556 }
4557 }
4558 continue;
4559encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004561 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004562
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 /* code first surrogate */
4564 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004565 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 while (base64bits >= 6) {
4567 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4568 base64bits -= 6;
4569 }
4570 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004571 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 base64bits += 16;
4574 base64buffer = (base64buffer << 16) | ch;
4575 while (base64bits >= 6) {
4576 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4577 base64bits -= 6;
4578 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 if (base64bits)
4581 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4582 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004584 if (_PyBytes_Resize(&v, out - start) < 0)
4585 return NULL;
4586 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004587}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004588PyObject *
4589PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4590 Py_ssize_t size,
4591 int base64SetO,
4592 int base64WhiteSpace,
4593 const char *errors)
4594{
4595 PyObject *result;
4596 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4597 if (tmp == NULL)
4598 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004599 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004600 base64WhiteSpace, errors);
4601 Py_DECREF(tmp);
4602 return result;
4603}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004604
/* The UTF-7 helper macros are private to the codec above; drop them so
   they cannot leak into the rest of the file. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004610
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611/* --- UTF-8 Codec -------------------------------------------------------- */
4612
Alexander Belopolsky40018472011-02-26 01:02:56 +00004613PyObject *
4614PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004615 Py_ssize_t size,
4616 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617{
Walter Dörwald69652032004-09-07 20:24:22 +00004618 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4619}
4620
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004621#include "stringlib/asciilib.h"
4622#include "stringlib/codecs.h"
4623#include "stringlib/undef.h"
4624
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004625#include "stringlib/ucs1lib.h"
4626#include "stringlib/codecs.h"
4627#include "stringlib/undef.h"
4628
4629#include "stringlib/ucs2lib.h"
4630#include "stringlib/codecs.h"
4631#include "stringlib/undef.h"
4632
4633#include "stringlib/ucs4lib.h"
4634#include "stringlib/codecs.h"
4635#include "stringlib/undef.h"
4636
Antoine Pitrouab868312009-01-10 15:40:25 +00004637/* Mask to quickly check whether a C 'long' contains a
4638 non-ASCII, UTF8-encoded char. */
4639#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004640# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004641#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004642# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004643#else
4644# error C 'long' size should be either 4 or 8!
4645#endif
4646
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004647static Py_ssize_t
4648ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004649{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004650 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004651 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004652
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004653 /*
4654 * Issue #17237: m68k is a bit different from most architectures in
4655 * that objects do not use "natural alignment" - for example, int and
4656 * long are only aligned at 2-byte boundaries. Therefore the assert()
4657 * won't work; also, tests have shown that skipping the "optimised
4658 * version" will even speed up m68k.
4659 */
4660#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004661#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004662 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4663 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664 /* Fast path, see in STRINGLIB(utf8_decode) for
4665 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004666 /* Help allocation */
4667 const char *_p = p;
4668 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004669 while (_p < aligned_end) {
4670 unsigned long value = *(const unsigned long *) _p;
4671 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673 *((unsigned long *)q) = value;
4674 _p += SIZEOF_LONG;
4675 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004676 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 p = _p;
4678 while (p < end) {
4679 if ((unsigned char)*p & 0x80)
4680 break;
4681 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004686#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004687 while (p < end) {
4688 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4689 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004690 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004691 /* Help allocation */
4692 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 while (_p < aligned_end) {
4694 unsigned long value = *(unsigned long *) _p;
4695 if (value & ASCII_CHAR_MASK)
4696 break;
4697 _p += SIZEOF_LONG;
4698 }
4699 p = _p;
4700 if (_p == end)
4701 break;
4702 }
4703 if ((unsigned char)*p & 0x80)
4704 break;
4705 ++p;
4706 }
4707 memcpy(dest, start, p - start);
4708 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709}
Antoine Pitrouab868312009-01-10 15:40:25 +00004710
Victor Stinner785938e2011-12-11 20:09:03 +01004711PyObject *
4712PyUnicode_DecodeUTF8Stateful(const char *s,
4713 Py_ssize_t size,
4714 const char *errors,
4715 Py_ssize_t *consumed)
4716{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004717 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004718 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004719 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720
4721 Py_ssize_t startinpos;
4722 Py_ssize_t endinpos;
4723 const char *errmsg = "";
4724 PyObject *errorHandler = NULL;
4725 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004726
4727 if (size == 0) {
4728 if (consumed)
4729 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004730 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004731 }
4732
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004733 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4734 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004735 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004736 *consumed = 1;
4737 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004738 }
4739
Victor Stinner8f674cc2013-04-17 23:02:17 +02004740 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004741 writer.min_length = size;
4742 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004743 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004744
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004745 writer.pos = ascii_decode(s, end, writer.data);
4746 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004747 while (s < end) {
4748 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004749 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004750 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004751 if (PyUnicode_IS_ASCII(writer.buffer))
4752 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004754 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004755 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004756 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 } else {
4758 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004759 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 }
4761
4762 switch (ch) {
4763 case 0:
4764 if (s == end || consumed)
4765 goto End;
4766 errmsg = "unexpected end of data";
4767 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004768 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004769 break;
4770 case 1:
4771 errmsg = "invalid start byte";
4772 startinpos = s - starts;
4773 endinpos = startinpos + 1;
4774 break;
4775 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004776 case 3:
4777 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 errmsg = "invalid continuation byte";
4779 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004780 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004781 break;
4782 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004783 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 goto onError;
4785 continue;
4786 }
4787
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004788 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004789 errors, &errorHandler,
4790 "utf-8", errmsg,
4791 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004792 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004793 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004794 }
4795
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004796End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797 if (consumed)
4798 *consumed = s - starts;
4799
4800 Py_XDECREF(errorHandler);
4801 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004802 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004803
4804onError:
4805 Py_XDECREF(errorHandler);
4806 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004807 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004808 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004809}
4810
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Bytes that cannot be decoded are mapped to the lone surrogates
   U+DC00 + byte value.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL on memory
   allocation error or when `size` is negative or too large for the
   buffer size to be computed. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wide characters are enough.
       The bound is tested without computing size + 1, which could itself
       overflow; a negative size becomes a huge unsigned value and is
       rejected as well. */
    if ((size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* the 4-byte decoder stores every character itself */
            assert(0);
#else
            /* ch is a code point that needs a surrogate pair in a 2-byte
               wchar_t buffer; it is not a surrogate itself. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* status code: 0 at the very end means everything is decoded */
            if (!ch && s == e)
                break;
            /* surrogateescape: keep the offending byte as U+DCxx */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004867/* Primary internal function which creates utf8 encoded bytes objects.
4868
4869 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004870 and allocate exactly as much space needed at the end. Else allocate the
4871 maximum possible needed (4 result bytes per Unicode character), and return
4872 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004873*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004874PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004875_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876{
Victor Stinner6099a032011-12-18 14:22:26 +01004877 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004878 void *data;
4879 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004881 if (!PyUnicode_Check(unicode)) {
4882 PyErr_BadArgument();
4883 return NULL;
4884 }
4885
4886 if (PyUnicode_READY(unicode) == -1)
4887 return NULL;
4888
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004889 if (PyUnicode_UTF8(unicode))
4890 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4891 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004892
4893 kind = PyUnicode_KIND(unicode);
4894 data = PyUnicode_DATA(unicode);
4895 size = PyUnicode_GET_LENGTH(unicode);
4896
Benjamin Petersonead6b532011-12-20 17:23:42 -06004897 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004898 default:
4899 assert(0);
4900 case PyUnicode_1BYTE_KIND:
4901 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4902 assert(!PyUnicode_IS_ASCII(unicode));
4903 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4904 case PyUnicode_2BYTE_KIND:
4905 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4906 case PyUnicode_4BYTE_KIND:
4907 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909}
4910
Alexander Belopolsky40018472011-02-26 01:02:56 +00004911PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004912PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4913 Py_ssize_t size,
4914 const char *errors)
4915{
4916 PyObject *v, *unicode;
4917
4918 unicode = PyUnicode_FromUnicode(s, size);
4919 if (unicode == NULL)
4920 return NULL;
4921 v = _PyUnicode_AsUTF8String(unicode, errors);
4922 Py_DECREF(unicode);
4923 return v;
4924}
4925
4926PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004927PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004929 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930}
4931
Walter Dörwald41980ca2007-08-16 21:55:45 +00004932/* --- UTF-32 Codec ------------------------------------------------------- */
4933
4934PyObject *
4935PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 Py_ssize_t size,
4937 const char *errors,
4938 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939{
4940 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4941}
4942
4943PyObject *
4944PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 Py_ssize_t size,
4946 const char *errors,
4947 int *byteorder,
4948 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949{
4950 const char *starts = s;
4951 Py_ssize_t startinpos;
4952 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004953 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004954 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004955 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957 PyObject *errorHandler = NULL;
4958 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004959
Walter Dörwald41980ca2007-08-16 21:55:45 +00004960 q = (unsigned char *)s;
4961 e = q + size;
4962
4963 if (byteorder)
4964 bo = *byteorder;
4965
4966 /* Check for BOM marks (U+FEFF) in the input and adjust current
4967 byte order setting accordingly. In native mode, the leading BOM
4968 mark is skipped, in all other modes, it is copied to the output
4969 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004970 if (bo == 0 && size >= 4) {
4971 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4972 if (bom == 0x0000FEFF) {
4973 bo = -1;
4974 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004976 else if (bom == 0xFFFE0000) {
4977 bo = 1;
4978 q += 4;
4979 }
4980 if (byteorder)
4981 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982 }
4983
Victor Stinnere64322e2012-10-30 23:12:47 +01004984 if (q == e) {
4985 if (consumed)
4986 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004987 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004988 }
4989
Victor Stinnere64322e2012-10-30 23:12:47 +01004990#ifdef WORDS_BIGENDIAN
4991 le = bo < 0;
4992#else
4993 le = bo <= 0;
4994#endif
4995
Victor Stinner8f674cc2013-04-17 23:02:17 +02004996 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004997 writer.min_length = (e - q + 3) / 4;
4998 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005000
Victor Stinnere64322e2012-10-30 23:12:47 +01005001 while (1) {
5002 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005004
Victor Stinnere64322e2012-10-30 23:12:47 +01005005 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 enum PyUnicode_Kind kind = writer.kind;
5007 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005010 if (le) {
5011 do {
5012 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5013 if (ch > maxch)
5014 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005015 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005016 q += 4;
5017 } while (q <= last);
5018 }
5019 else {
5020 do {
5021 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5022 if (ch > maxch)
5023 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005024 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005025 q += 4;
5026 } while (q <= last);
5027 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005028 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005029 }
5030
5031 if (ch <= maxch) {
5032 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005036 startinpos = ((const char *)q) - starts;
5037 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005039 else {
5040 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005041 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005042 goto onError;
5043 q += 4;
5044 continue;
5045 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005047 startinpos = ((const char *)q) - starts;
5048 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005050
5051 /* The remaining input chars are ignored if the callback
5052 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005053 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 errors, &errorHandler,
5055 "utf32", errmsg,
5056 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005057 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059 }
5060
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063
Walter Dörwald41980ca2007-08-16 21:55:45 +00005064 Py_XDECREF(errorHandler);
5065 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005066 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005067
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005069 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070 Py_XDECREF(errorHandler);
5071 Py_XDECREF(exc);
5072 return NULL;
5073}
5074
5075PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005076_PyUnicode_EncodeUTF32(PyObject *str,
5077 const char *errors,
5078 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005080 int kind;
5081 void *data;
5082 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005083 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005085 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005087#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005088 int iorder[] = {0, 1, 2, 3};
5089#else
5090 int iorder[] = {3, 2, 1, 0};
5091#endif
5092
Benjamin Peterson29060642009-01-31 22:14:21 +00005093#define STORECHAR(CH) \
5094 do { \
5095 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5096 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5097 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5098 p[iorder[0]] = (CH) & 0xff; \
5099 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100 } while(0)
5101
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005102 if (!PyUnicode_Check(str)) {
5103 PyErr_BadArgument();
5104 return NULL;
5105 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005106 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005107 return NULL;
5108 kind = PyUnicode_KIND(str);
5109 data = PyUnicode_DATA(str);
5110 len = PyUnicode_GET_LENGTH(str);
5111
5112 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005113 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005115 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116 if (v == NULL)
5117 return NULL;
5118
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005119 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005122 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005123 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005124
5125 if (byteorder == -1) {
5126 /* force LE */
5127 iorder[0] = 0;
5128 iorder[1] = 1;
5129 iorder[2] = 2;
5130 iorder[3] = 3;
5131 }
5132 else if (byteorder == 1) {
5133 /* force BE */
5134 iorder[0] = 3;
5135 iorder[1] = 2;
5136 iorder[2] = 1;
5137 iorder[3] = 0;
5138 }
5139
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005140 for (i = 0; i < len; i++)
5141 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005142
5143 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005144 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005145#undef STORECHAR
5146}
5147
Alexander Belopolsky40018472011-02-26 01:02:56 +00005148PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005149PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5150 Py_ssize_t size,
5151 const char *errors,
5152 int byteorder)
5153{
5154 PyObject *result;
5155 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5156 if (tmp == NULL)
5157 return NULL;
5158 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5159 Py_DECREF(tmp);
5160 return result;
5161}
5162
5163PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005164PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005165{
Victor Stinnerb960b342011-11-20 19:12:52 +01005166 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005167}
5168
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169/* --- UTF-16 Codec ------------------------------------------------------- */
5170
Tim Peters772747b2001-08-09 22:21:55 +00005171PyObject *
5172PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 Py_ssize_t size,
5174 const char *errors,
5175 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176{
Walter Dörwald69652032004-09-07 20:24:22 +00005177 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5178}
5179
5180PyObject *
5181PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 Py_ssize_t size,
5183 const char *errors,
5184 int *byteorder,
5185 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005188 Py_ssize_t startinpos;
5189 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005190 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005191 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005192 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005193 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005194 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005195 PyObject *errorHandler = NULL;
5196 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197
Tim Peters772747b2001-08-09 22:21:55 +00005198 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005199 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200
5201 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005202 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005204 /* Check for BOM marks (U+FEFF) in the input and adjust current
5205 byte order setting accordingly. In native mode, the leading BOM
5206 mark is skipped, in all other modes, it is copied to the output
5207 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005208 if (bo == 0 && size >= 2) {
5209 const Py_UCS4 bom = (q[1] << 8) | q[0];
5210 if (bom == 0xFEFF) {
5211 q += 2;
5212 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005214 else if (bom == 0xFFFE) {
5215 q += 2;
5216 bo = 1;
5217 }
5218 if (byteorder)
5219 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221
Antoine Pitrou63065d72012-05-15 23:48:04 +02005222 if (q == e) {
5223 if (consumed)
5224 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005225 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005226 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005227
Christian Heimes743e0cd2012-10-17 23:52:17 +02005228#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005229 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005230#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005231 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005232#endif
Tim Peters772747b2001-08-09 22:21:55 +00005233
Antoine Pitrou63065d72012-05-15 23:48:04 +02005234 /* Note: size will always be longer than the resulting Unicode
5235 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005236 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005237 writer.min_length = (e - q + 1) / 2;
5238 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005239 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005240
Antoine Pitrou63065d72012-05-15 23:48:04 +02005241 while (1) {
5242 Py_UCS4 ch = 0;
5243 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005245 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005246 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005247 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005248 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005249 native_ordering);
5250 else
5251 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005252 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005253 native_ordering);
5254 } else if (kind == PyUnicode_2BYTE_KIND) {
5255 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005256 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005257 native_ordering);
5258 } else {
5259 assert(kind == PyUnicode_4BYTE_KIND);
5260 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005261 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005262 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005263 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005264 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005265
Antoine Pitrou63065d72012-05-15 23:48:04 +02005266 switch (ch)
5267 {
5268 case 0:
5269 /* remaining byte at the end? (size should be even) */
5270 if (q == e || consumed)
5271 goto End;
5272 errmsg = "truncated data";
5273 startinpos = ((const char *)q) - starts;
5274 endinpos = ((const char *)e) - starts;
5275 break;
5276 /* The remaining input chars are ignored if the callback
5277 chooses to skip the input */
5278 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005279 q -= 2;
5280 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005281 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005282 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005283 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005284 endinpos = ((const char *)e) - starts;
5285 break;
5286 case 2:
5287 errmsg = "illegal encoding";
5288 startinpos = ((const char *)q) - 2 - starts;
5289 endinpos = startinpos + 2;
5290 break;
5291 case 3:
5292 errmsg = "illegal UTF-16 surrogate";
5293 startinpos = ((const char *)q) - 4 - starts;
5294 endinpos = startinpos + 2;
5295 break;
5296 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005297 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005298 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 continue;
5300 }
5301
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005303 errors,
5304 &errorHandler,
5305 "utf16", errmsg,
5306 &starts,
5307 (const char **)&e,
5308 &startinpos,
5309 &endinpos,
5310 &exc,
5311 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005313 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 }
5315
Antoine Pitrou63065d72012-05-15 23:48:04 +02005316End:
Walter Dörwald69652032004-09-07 20:24:22 +00005317 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320 Py_XDECREF(errorHandler);
5321 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005322 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005325 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005326 Py_XDECREF(errorHandler);
5327 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 return NULL;
5329}
5330
Tim Peters772747b2001-08-09 22:21:55 +00005331PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005332_PyUnicode_EncodeUTF16(PyObject *str,
5333 const char *errors,
5334 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005336 enum PyUnicode_Kind kind;
5337 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005338 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005339 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005340 unsigned short *out;
5341 Py_ssize_t bytesize;
5342 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005343#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005344 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005345#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005346 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005347#endif
5348
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005349 if (!PyUnicode_Check(str)) {
5350 PyErr_BadArgument();
5351 return NULL;
5352 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005353 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005354 return NULL;
5355 kind = PyUnicode_KIND(str);
5356 data = PyUnicode_DATA(str);
5357 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005358
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005359 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005360 if (kind == PyUnicode_4BYTE_KIND) {
5361 const Py_UCS4 *in = (const Py_UCS4 *)data;
5362 const Py_UCS4 *end = in + len;
5363 while (in < end)
5364 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005365 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005366 }
5367 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005369 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005370 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 if (v == NULL)
5372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005374 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005375 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005376 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005378 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005380 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005381
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005382 switch (kind) {
5383 case PyUnicode_1BYTE_KIND: {
5384 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5385 break;
Tim Peters772747b2001-08-09 22:21:55 +00005386 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005387 case PyUnicode_2BYTE_KIND: {
5388 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5389 break;
Tim Peters772747b2001-08-09 22:21:55 +00005390 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005391 case PyUnicode_4BYTE_KIND: {
5392 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5393 break;
5394 }
5395 default:
5396 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005397 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005398
5399 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005400 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401}
5402
Alexander Belopolsky40018472011-02-26 01:02:56 +00005403PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005404PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5405 Py_ssize_t size,
5406 const char *errors,
5407 int byteorder)
5408{
5409 PyObject *result;
5410 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5411 if (tmp == NULL)
5412 return NULL;
5413 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5414 Py_DECREF(tmp);
5415 return result;
5416}
5417
5418PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005419PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005421 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422}
5423
5424/* --- Unicode Escape Codec ----------------------------------------------- */
5425
/* Helper for PyUnicode_DecodeUnicodeEscape: scan the escaped input and
   decide whether the decoded result is guaranteed to be pure ASCII.

   Returns the number of characters the decoded string will contain when
   that guarantee holds.  Returns -1 when `size` is negative, when a
   non-ASCII byte is present, when a backslash is the last byte, or when
   an octal, \x, \u, \U or \N escape is found (those may produce
   characters outside the ASCII range).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)s;
    Py_ssize_t count = 0;
    Py_ssize_t i = 0;

    if (size < 0)
        return -1;

    while (i < size) {
        unsigned char c = bytes[i++];

        if (c > 127) {
            /* Non-ASCII */
            return -1;
        }
        if (c != '\\') {
            /* Normal character */
            count++;
            continue;
        }

        /* Backslash-escape: it must be followed by an ASCII byte. */
        if (i >= size)
            return -1;
        c = bytes[i++];
        if (c > 127)
            return -1;

        switch (c) {
        case '\n':
            /* backslash + \n result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f': case 'n':
        case 'r': case 't': case 'v':
            /* two input bytes collapse into one character */
            count += 1;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* unknown escape: the backslash and the character are both kept */
            count += 2;
            break;
        }
    }
    return count;
}
5480
Fredrik Lundh06d12682001-01-24 07:59:11 +00005481static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005482
Alexander Belopolsky40018472011-02-26 01:02:56 +00005483PyObject *
5484PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005485 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005486 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005489 Py_ssize_t startinpos;
5490 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005491 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005493 char* message;
5494 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005495 PyObject *errorHandler = NULL;
5496 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005497 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005498
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005499 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005500 if (len == 0)
5501 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005502
5503 /* After length_of_escaped_ascii_string() there are two alternatives,
5504 either the string is pure ASCII with named escapes like \n, etc.
5505 and we determined it's exact size (common case)
5506 or it contains \x, \u, ... escape sequences. then we create a
5507 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005508 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005509 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005510 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005511 }
5512 else {
5513 /* Escaped strings will always be longer than the resulting
5514 Unicode string, so we start with size here and then reduce the
5515 length after conversion to the true value.
5516 (but if the error callback returns a long replacement string
5517 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005518 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005519 }
5520
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005522 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 while (s < end) {
5526 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005527 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529
5530 /* Non-escape characters are interpreted as Unicode ordinals */
5531 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005532 x = (unsigned char)*s;
5533 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005534 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005535 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 continue;
5537 }
5538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005539 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 /* \ - Escapes */
5541 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005542 c = *s++;
5543 if (s > end)
5544 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005545
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005546 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005549#define WRITECHAR(ch) \
5550 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005551 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005552 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005553 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005554
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005556 case '\\': WRITECHAR('\\'); break;
5557 case '\'': WRITECHAR('\''); break;
5558 case '\"': WRITECHAR('\"'); break;
5559 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005560 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005561 case 'f': WRITECHAR('\014'); break;
5562 case 't': WRITECHAR('\t'); break;
5563 case 'n': WRITECHAR('\n'); break;
5564 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005565 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005566 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005567 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005568 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 case '0': case '1': case '2': case '3':
5572 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005573 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005574 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005575 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005576 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005577 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005579 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 break;
5581
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 /* hex escapes */
5583 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005585 digits = 2;
5586 message = "truncated \\xXX escape";
5587 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005591 digits = 4;
5592 message = "truncated \\uXXXX escape";
5593 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005596 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005597 digits = 8;
5598 message = "truncated \\UXXXXXXXX escape";
5599 hexescape:
5600 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005601 if (end - s < digits) {
5602 /* count only hex digits */
5603 for (; s < end; ++s) {
5604 c = (unsigned char)*s;
5605 if (!Py_ISXDIGIT(c))
5606 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005607 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005608 goto error;
5609 }
5610 for (; digits--; ++s) {
5611 c = (unsigned char)*s;
5612 if (!Py_ISXDIGIT(c))
5613 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005614 chr = (chr<<4) & ~0xF;
5615 if (c >= '0' && c <= '9')
5616 chr += c - '0';
5617 else if (c >= 'a' && c <= 'f')
5618 chr += 10 + c - 'a';
5619 else
5620 chr += 10 + c - 'A';
5621 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005622 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 /* _decoding_error will have already written into the
5624 target buffer. */
5625 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005626 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005627 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005628 message = "illegal Unicode character";
5629 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005630 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005631 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005632 break;
5633
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005635 case 'N':
5636 message = "malformed \\N character escape";
5637 if (ucnhash_CAPI == NULL) {
5638 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005639 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5640 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005641 if (ucnhash_CAPI == NULL)
5642 goto ucnhashError;
5643 }
5644 if (*s == '{') {
5645 const char *start = s+1;
5646 /* look for the closing brace */
5647 while (*s != '}' && s < end)
5648 s++;
5649 if (s > start && s < end && *s == '}') {
5650 /* found a name. look it up in the unicode database */
5651 message = "unknown Unicode character name";
5652 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005653 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005654 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005655 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005656 goto store;
5657 }
5658 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005659 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005660
5661 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005662 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 message = "\\ at end of string";
5664 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005665 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005666 }
5667 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005668 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005669 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005670 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005671 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005673 continue;
5674
5675 error:
5676 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005677 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005678 errors, &errorHandler,
5679 "unicodeescape", message,
5680 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005681 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005682 goto onError;
5683 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005685#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005687 Py_XDECREF(errorHandler);
5688 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005690
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005692 PyErr_SetString(
5693 PyExc_UnicodeError,
5694 "\\N escapes not supported (can't load unicodedata module)"
5695 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005696 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 Py_XDECREF(errorHandler);
5698 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005699 return NULL;
5700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005702 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 Py_XDECREF(errorHandler);
5704 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return NULL;
5706}
5707
5708/* Return a Unicode-Escape string version of the Unicode object.
5709
5710 If quotes is true, the string is enclosed in u"" or u'' quotes as
5711 appropriate.
5712
5713*/
5714
Alexander Belopolsky40018472011-02-26 01:02:56 +00005715PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005716PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005718 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005719 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005721 int kind;
5722 void *data;
5723 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
Ezio Melottie7f90372012-10-05 03:33:31 +03005725 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005726 escape.
5727
Ezio Melottie7f90372012-10-05 03:33:31 +03005728 For UCS1 strings it's '\xxx', 4 bytes per source character.
5729 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5730 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005731 */
5732
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005733 if (!PyUnicode_Check(unicode)) {
5734 PyErr_BadArgument();
5735 return NULL;
5736 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005737 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005738 return NULL;
5739 len = PyUnicode_GET_LENGTH(unicode);
5740 kind = PyUnicode_KIND(unicode);
5741 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005742 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005743 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5744 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5745 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5746 }
5747
5748 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005749 return PyBytes_FromStringAndSize(NULL, 0);
5750
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005751 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005753
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005754 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005756 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005757 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 if (repr == NULL)
5759 return NULL;
5760
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005761 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005763 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005764 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005765
Walter Dörwald79e913e2007-05-12 11:08:06 +00005766 /* Escape backslashes */
5767 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 *p++ = '\\';
5769 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005770 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005771 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005772
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005773 /* Map 21-bit characters to '\U00xxxxxx' */
5774 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005775 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005776 *p++ = '\\';
5777 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005778 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5779 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5780 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5781 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5782 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5783 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5784 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5785 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005787 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005788
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005790 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 *p++ = '\\';
5792 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005793 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5794 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5795 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5796 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005798
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005799 /* Map special whitespace to '\t', \n', '\r' */
5800 else if (ch == '\t') {
5801 *p++ = '\\';
5802 *p++ = 't';
5803 }
5804 else if (ch == '\n') {
5805 *p++ = '\\';
5806 *p++ = 'n';
5807 }
5808 else if (ch == '\r') {
5809 *p++ = '\\';
5810 *p++ = 'r';
5811 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005812
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005813 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005814 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005816 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005817 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5818 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005819 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005820
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 /* Copy everything else as-is */
5822 else
5823 *p++ = (char) ch;
5824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005826 assert(p - PyBytes_AS_STRING(repr) > 0);
5827 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5828 return NULL;
5829 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830}
5831
Alexander Belopolsky40018472011-02-26 01:02:56 +00005832PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005833PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5834 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005836 PyObject *result;
5837 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5838 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005840 result = PyUnicode_AsUnicodeEscapeString(tmp);
5841 Py_DECREF(tmp);
5842 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843}
5844
5845/* --- Raw Unicode Escape Codec ------------------------------------------- */
5846
Alexander Belopolsky40018472011-02-26 01:02:56 +00005847PyObject *
5848PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005849 Py_ssize_t size,
5850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005853 Py_ssize_t startinpos;
5854 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 const char *end;
5857 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005858 PyObject *errorHandler = NULL;
5859 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005860
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005861 if (size == 0)
5862 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005863
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 /* Escaped strings will always be longer than the resulting
5865 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 length after conversion to the true value. (But decoding error
5867 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005868 _PyUnicodeWriter_Init(&writer);
5869 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005870
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 end = s + size;
5872 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 unsigned char c;
5874 Py_UCS4 x;
5875 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005876 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 /* Non-escape characters are interpreted as Unicode ordinals */
5879 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005880 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005881 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005882 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005884 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 startinpos = s-starts;
5886
5887 /* \u-escapes are only interpreted iff the number of leading
5888 backslashes if odd */
5889 bs = s;
5890 for (;s < end;) {
5891 if (*s != '\\')
5892 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005893 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005894 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005895 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 }
5897 if (((s - bs) & 1) == 0 ||
5898 s >= end ||
5899 (*s != 'u' && *s != 'U')) {
5900 continue;
5901 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005902 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 count = *s=='u' ? 4 : 8;
5904 s++;
5905
5906 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 for (x = 0, i = 0; i < count; ++i, ++s) {
5908 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005909 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005911 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 errors, &errorHandler,
5913 "rawunicodeescape", "truncated \\uXXXX",
5914 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005915 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 goto onError;
5917 goto nextByte;
5918 }
5919 x = (x<<4) & ~0xF;
5920 if (c >= '0' && c <= '9')
5921 x += c - '0';
5922 else if (c >= 'a' && c <= 'f')
5923 x += 10 + c - 'a';
5924 else
5925 x += 10 + c - 'A';
5926 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005927 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005928 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005929 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005930 }
5931 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005932 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005933 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005934 errors, &errorHandler,
5935 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005937 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005939 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 nextByte:
5941 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943 Py_XDECREF(errorHandler);
5944 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005945 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005946
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005948 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949 Py_XDECREF(errorHandler);
5950 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 return NULL;
5952}
5953
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954
Alexander Belopolsky40018472011-02-26 01:02:56 +00005955PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005958 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 char *p;
5960 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 Py_ssize_t expandsize, pos;
5962 int kind;
5963 void *data;
5964 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005966 if (!PyUnicode_Check(unicode)) {
5967 PyErr_BadArgument();
5968 return NULL;
5969 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005970 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005971 return NULL;
5972 kind = PyUnicode_KIND(unicode);
5973 data = PyUnicode_DATA(unicode);
5974 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005975 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5976 bytes, and 1 byte characters 4. */
5977 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005978
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005979 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005981
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005982 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 if (repr == NULL)
5984 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005985 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005986 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005988 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005989 for (pos = 0; pos < len; pos++) {
5990 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 /* Map 32-bit characters to '\Uxxxxxxxx' */
5992 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005993 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005994 *p++ = '\\';
5995 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005996 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5997 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5998 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5999 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6000 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6001 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6002 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6003 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006004 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006006 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 *p++ = '\\';
6008 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006009 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6010 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6011 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6012 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 /* Copy everything else as-is */
6015 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 *p++ = (char) ch;
6017 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006018
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006019 assert(p > q);
6020 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006021 return NULL;
6022 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023}
6024
Alexander Belopolsky40018472011-02-26 01:02:56 +00006025PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006026PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6027 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006029 PyObject *result;
6030 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6031 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006032 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6034 Py_DECREF(tmp);
6035 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036}
6037
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006038/* --- Unicode Internal Codec ------------------------------------------- */
6039
Alexander Belopolsky40018472011-02-26 01:02:56 +00006040PyObject *
6041_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006042 Py_ssize_t size,
6043 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006044{
6045 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006046 Py_ssize_t startinpos;
6047 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006048 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006049 const char *end;
6050 const char *reason;
6051 PyObject *errorHandler = NULL;
6052 PyObject *exc = NULL;
6053
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006054 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006055 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006056 1))
6057 return NULL;
6058
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006059 if (size == 0)
6060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006061
Victor Stinner8f674cc2013-04-17 23:02:17 +02006062 _PyUnicodeWriter_Init(&writer);
6063 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6064 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006066 }
6067 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006068
Victor Stinner8f674cc2013-04-17 23:02:17 +02006069 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006070 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006071 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006072 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006073 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006074 endinpos = end-starts;
6075 reason = "truncated input";
6076 goto error;
6077 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006078 /* We copy the raw representation one byte at a time because the
6079 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006080 ((char *) &uch)[0] = s[0];
6081 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006082#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006083 ((char *) &uch)[2] = s[2];
6084 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006085#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006086 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006087#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006088 /* We have to sanity check the raw data, otherwise doom looms for
6089 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006090 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006091 endinpos = s - starts + Py_UNICODE_SIZE;
6092 reason = "illegal code point (> 0x10FFFF)";
6093 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006094 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006095#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006096 s += Py_UNICODE_SIZE;
6097#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006098 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006099 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006100 Py_UNICODE uch2;
6101 ((char *) &uch2)[0] = s[0];
6102 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006103 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006104 {
Victor Stinner551ac952011-11-29 22:58:13 +01006105 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006106 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006107 }
6108 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006109#endif
6110
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006111 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006112 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006113 continue;
6114
6115 error:
6116 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006117 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006118 errors, &errorHandler,
6119 "unicode_internal", reason,
6120 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006121 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006122 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006123 }
6124
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006125 Py_XDECREF(errorHandler);
6126 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006127 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006128
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006130 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006131 Py_XDECREF(errorHandler);
6132 Py_XDECREF(exc);
6133 return NULL;
6134}
6135
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136/* --- Latin-1 Codec ------------------------------------------------------ */
6137
Alexander Belopolsky40018472011-02-26 01:02:56 +00006138PyObject *
6139PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006140 Py_ssize_t size,
6141 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006144 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145}
6146
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006148static void
6149make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006150 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006151 PyObject *unicode,
6152 Py_ssize_t startpos, Py_ssize_t endpos,
6153 const char *reason)
6154{
6155 if (*exceptionObject == NULL) {
6156 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006157 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006158 encoding, unicode, startpos, endpos, reason);
6159 }
6160 else {
6161 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6162 goto onError;
6163 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6164 goto onError;
6165 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6166 goto onError;
6167 return;
6168 onError:
6169 Py_DECREF(*exceptionObject);
6170 *exceptionObject = NULL;
6171 }
6172}
6173
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006175static void
6176raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006177 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006178 PyObject *unicode,
6179 Py_ssize_t startpos, Py_ssize_t endpos,
6180 const char *reason)
6181{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006182 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006183 encoding, unicode, startpos, endpos, reason);
6184 if (*exceptionObject != NULL)
6185 PyCodec_StrictErrors(*exceptionObject);
6186}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006187
6188/* error handling callback helper:
6189 build arguments, call the callback and check the arguments,
6190 put the result into newpos and return the replacement string, which
6191 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006192static PyObject *
6193unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006194 PyObject **errorHandler,
6195 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006196 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006197 Py_ssize_t startpos, Py_ssize_t endpos,
6198 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006200 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006201 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202 PyObject *restuple;
6203 PyObject *resunicode;
6204
6205 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209 }
6210
Benjamin Petersonbac79492012-01-14 13:34:47 -05006211 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006212 return NULL;
6213 len = PyUnicode_GET_LENGTH(unicode);
6214
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006215 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006216 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006219
6220 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006225 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 Py_DECREF(restuple);
6227 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006228 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006229 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 &resunicode, newpos)) {
6231 Py_DECREF(restuple);
6232 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006233 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006234 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6235 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6236 Py_DECREF(restuple);
6237 return NULL;
6238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006239 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006240 *newpos = len + *newpos;
6241 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6243 Py_DECREF(restuple);
6244 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006245 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246 Py_INCREF(resunicode);
6247 Py_DECREF(restuple);
6248 return resunicode;
6249}
6250
Alexander Belopolsky40018472011-02-26 01:02:56 +00006251static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006252unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006253 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006254 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006255{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006256 /* input state */
6257 Py_ssize_t pos=0, size;
6258 int kind;
6259 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006260 /* output object */
6261 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006262 /* pointer into the output */
6263 char *str;
6264 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006265 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006266 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6267 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006268 PyObject *errorHandler = NULL;
6269 PyObject *exc = NULL;
6270 /* the following variable is used for caching string comparisons
6271 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6272 int known_errorHandler = -1;
6273
Benjamin Petersonbac79492012-01-14 13:34:47 -05006274 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 return NULL;
6276 size = PyUnicode_GET_LENGTH(unicode);
6277 kind = PyUnicode_KIND(unicode);
6278 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 /* allocate enough for a simple encoding without
6280 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006281 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006282 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006283 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006284 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006285 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006286 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287 ressize = size;
6288
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006289 while (pos < size) {
6290 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 /* can we encode this? */
6293 if (c<limit) {
6294 /* no overflow check, because we know that the space is enough */
6295 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006296 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006297 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 Py_ssize_t requiredsize;
6300 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006301 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006303 Py_ssize_t collstart = pos;
6304 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006306 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 ++collend;
6308 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6309 if (known_errorHandler==-1) {
6310 if ((errors==NULL) || (!strcmp(errors, "strict")))
6311 known_errorHandler = 1;
6312 else if (!strcmp(errors, "replace"))
6313 known_errorHandler = 2;
6314 else if (!strcmp(errors, "ignore"))
6315 known_errorHandler = 3;
6316 else if (!strcmp(errors, "xmlcharrefreplace"))
6317 known_errorHandler = 4;
6318 else
6319 known_errorHandler = 0;
6320 }
6321 switch (known_errorHandler) {
6322 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006323 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 goto onError;
6325 case 2: /* replace */
6326 while (collstart++<collend)
6327 *str++ = '?'; /* fall through */
6328 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006329 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 break;
6331 case 4: /* xmlcharrefreplace */
6332 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006333 /* determine replacement size */
6334 for (i = collstart, repsize = 0; i < collend; ++i) {
6335 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6336 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006338 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006340 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006342 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006344 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006346 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006348 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006349 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006351 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006353 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 if (requiredsize > ressize) {
6355 if (requiredsize<2*ressize)
6356 requiredsize = 2*ressize;
6357 if (_PyBytes_Resize(&res, requiredsize))
6358 goto onError;
6359 str = PyBytes_AS_STRING(res) + respos;
6360 ressize = requiredsize;
6361 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006362 /* generate replacement */
6363 for (i = collstart; i < collend; ++i) {
6364 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006366 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 break;
6368 default:
6369 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006370 encoding, reason, unicode, &exc,
6371 collstart, collend, &newpos);
6372 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006373 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006375 if (PyBytes_Check(repunicode)) {
6376 /* Directly copy bytes result to output. */
6377 repsize = PyBytes_Size(repunicode);
6378 if (repsize > 1) {
6379 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006380 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006381 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6382 Py_DECREF(repunicode);
6383 goto onError;
6384 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006385 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006386 ressize += repsize-1;
6387 }
6388 memcpy(str, PyBytes_AsString(repunicode), repsize);
6389 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006390 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006391 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006392 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006393 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 /* need more space? (at least enough for what we
6395 have+the replacement+the rest of the string, so
6396 we won't have to check space for encodable characters) */
6397 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 repsize = PyUnicode_GET_LENGTH(repunicode);
6399 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 if (requiredsize > ressize) {
6401 if (requiredsize<2*ressize)
6402 requiredsize = 2*ressize;
6403 if (_PyBytes_Resize(&res, requiredsize)) {
6404 Py_DECREF(repunicode);
6405 goto onError;
6406 }
6407 str = PyBytes_AS_STRING(res) + respos;
6408 ressize = requiredsize;
6409 }
6410 /* check if there is anything unencodable in the replacement
6411 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 for (i = 0; repsize-->0; ++i, ++str) {
6413 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006415 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 Py_DECREF(repunicode);
6418 goto onError;
6419 }
6420 *str = (char)c;
6421 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006422 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006423 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006424 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006425 }
6426 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006427 /* Resize if we allocated to much */
6428 size = str - PyBytes_AS_STRING(res);
6429 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006430 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006431 if (_PyBytes_Resize(&res, size) < 0)
6432 goto onError;
6433 }
6434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006435 Py_XDECREF(errorHandler);
6436 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006437 return res;
6438
6439 onError:
6440 Py_XDECREF(res);
6441 Py_XDECREF(errorHandler);
6442 Py_XDECREF(exc);
6443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444}
6445
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006446/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006447PyObject *
6448PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006449 Py_ssize_t size,
6450 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452 PyObject *result;
6453 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6454 if (unicode == NULL)
6455 return NULL;
6456 result = unicode_encode_ucs1(unicode, errors, 256);
6457 Py_DECREF(unicode);
6458 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459}
6460
Alexander Belopolsky40018472011-02-26 01:02:56 +00006461PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006462_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463{
6464 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 PyErr_BadArgument();
6466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006468 if (PyUnicode_READY(unicode) == -1)
6469 return NULL;
6470 /* Fast path: if it is a one-byte string, construct
6471 bytes object directly. */
6472 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6473 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6474 PyUnicode_GET_LENGTH(unicode));
6475 /* Non-Latin-1 characters present. Defer to above function to
6476 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006477 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006478}
6479
6480PyObject*
6481PyUnicode_AsLatin1String(PyObject *unicode)
6482{
6483 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484}
6485
6486/* --- 7-bit ASCII Codec -------------------------------------------------- */
6487
Alexander Belopolsky40018472011-02-26 01:02:56 +00006488PyObject *
6489PyUnicode_DecodeASCII(const char *s,
6490 Py_ssize_t size,
6491 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006493 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006494 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006495 int kind;
6496 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006497 Py_ssize_t startinpos;
6498 Py_ssize_t endinpos;
6499 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006500 const char *e;
6501 PyObject *errorHandler = NULL;
6502 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006503
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006505 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006506
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006508 if (size == 1 && (unsigned char)s[0] < 128)
6509 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006510
Victor Stinner8f674cc2013-04-17 23:02:17 +02006511 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006512 writer.min_length = size;
6513 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006514 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006515
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006517 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006518 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006519 writer.pos = outpos;
6520 if (writer.pos == size)
6521 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006522
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006523 s += writer.pos;
6524 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006525 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006526 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006528 PyUnicode_WRITE(kind, data, writer.pos, c);
6529 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 ++s;
6531 }
6532 else {
6533 startinpos = s-starts;
6534 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006535 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 errors, &errorHandler,
6537 "ascii", "ordinal not in range(128)",
6538 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006539 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006541 kind = writer.kind;
6542 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006545 Py_XDECREF(errorHandler);
6546 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006547 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006548
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006550 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006551 Py_XDECREF(errorHandler);
6552 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 return NULL;
6554}
6555
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006557PyObject *
6558PyUnicode_EncodeASCII(const Py_UNICODE *p,
6559 Py_ssize_t size,
6560 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006562 PyObject *result;
6563 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6564 if (unicode == NULL)
6565 return NULL;
6566 result = unicode_encode_ucs1(unicode, errors, 128);
6567 Py_DECREF(unicode);
6568 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569}
6570
Alexander Belopolsky40018472011-02-26 01:02:56 +00006571PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006572_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573{
6574 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 PyErr_BadArgument();
6576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006578 if (PyUnicode_READY(unicode) == -1)
6579 return NULL;
6580 /* Fast path: if it is an ASCII-only string, construct bytes object
6581 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006582 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006583 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6584 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006585 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006586}
6587
6588PyObject *
6589PyUnicode_AsASCIIString(PyObject *unicode)
6590{
6591 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592}
6593
Victor Stinner99b95382011-07-04 14:23:54 +02006594#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006595
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006596/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006597
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006598#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006599#define NEED_RETRY
6600#endif
6601
Victor Stinner3a50e702011-10-18 21:21:00 +02006602#ifndef WC_ERR_INVALID_CHARS
6603# define WC_ERR_INVALID_CHARS 0x0080
6604#endif
6605
6606static char*
6607code_page_name(UINT code_page, PyObject **obj)
6608{
6609 *obj = NULL;
6610 if (code_page == CP_ACP)
6611 return "mbcs";
6612 if (code_page == CP_UTF7)
6613 return "CP_UTF7";
6614 if (code_page == CP_UTF8)
6615 return "CP_UTF8";
6616
6617 *obj = PyBytes_FromFormat("cp%u", code_page);
6618 if (*obj == NULL)
6619 return NULL;
6620 return PyBytes_AS_STRING(*obj);
6621}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006622
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006624is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006625{
6626 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006627 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006628
Victor Stinner3a50e702011-10-18 21:21:00 +02006629 if (!IsDBCSLeadByteEx(code_page, *curr))
6630 return 0;
6631
6632 prev = CharPrevExA(code_page, s, curr, 0);
6633 if (prev == curr)
6634 return 1;
6635 /* FIXME: This code is limited to "true" double-byte encodings,
6636 as it assumes an incomplete character consists of a single
6637 byte. */
6638 if (curr - prev == 2)
6639 return 1;
6640 if (!IsDBCSLeadByteEx(code_page, *prev))
6641 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006642 return 0;
6643}
6644
Victor Stinner3a50e702011-10-18 21:21:00 +02006645static DWORD
6646decode_code_page_flags(UINT code_page)
6647{
6648 if (code_page == CP_UTF7) {
6649 /* The CP_UTF7 decoder only supports flags=0 */
6650 return 0;
6651 }
6652 else
6653 return MB_ERR_INVALID_CHARS;
6654}
6655
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006656/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006657 * Decode a byte string from a Windows code page into unicode object in strict
6658 * mode.
6659 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006660 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6661 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006662 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006663static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006664decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006665 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006666 const char *in,
6667 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006668{
Victor Stinner3a50e702011-10-18 21:21:00 +02006669 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006670 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006671 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006672
6673 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006674 assert(insize > 0);
6675 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6676 if (outsize <= 0)
6677 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006678
6679 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006681 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006682 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 if (*v == NULL)
6684 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006685 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006686 }
6687 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006689 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006690 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006692 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006693 }
6694
6695 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006696 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6697 if (outsize <= 0)
6698 goto error;
6699 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006700
Victor Stinner3a50e702011-10-18 21:21:00 +02006701error:
6702 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6703 return -2;
6704 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006705 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706}
6707
Victor Stinner3a50e702011-10-18 21:21:00 +02006708/*
6709 * Decode a byte string from a code page into unicode object with an error
6710 * handler.
6711 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006712 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006713 * UnicodeDecodeError exception and returns -1 on error.
6714 */
6715static int
6716decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006717 PyObject **v,
6718 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006719 const char *errors)
6720{
6721 const char *startin = in;
6722 const char *endin = in + size;
6723 const DWORD flags = decode_code_page_flags(code_page);
6724 /* Ideally, we should get reason from FormatMessage. This is the Windows
6725 2000 English version of the message. */
6726 const char *reason = "No mapping for the Unicode character exists "
6727 "in the target code page.";
6728 /* each step cannot decode more than 1 character, but a character can be
6729 represented as a surrogate pair */
6730 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006731 int insize;
6732 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006733 PyObject *errorHandler = NULL;
6734 PyObject *exc = NULL;
6735 PyObject *encoding_obj = NULL;
6736 char *encoding;
6737 DWORD err;
6738 int ret = -1;
6739
6740 assert(size > 0);
6741
6742 encoding = code_page_name(code_page, &encoding_obj);
6743 if (encoding == NULL)
6744 return -1;
6745
6746 if (errors == NULL || strcmp(errors, "strict") == 0) {
6747 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6748 UnicodeDecodeError. */
6749 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6750 if (exc != NULL) {
6751 PyCodec_StrictErrors(exc);
6752 Py_CLEAR(exc);
6753 }
6754 goto error;
6755 }
6756
6757 if (*v == NULL) {
6758 /* Create unicode object */
6759 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6760 PyErr_NoMemory();
6761 goto error;
6762 }
Victor Stinnerab595942011-12-17 04:59:06 +01006763 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006764 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006765 if (*v == NULL)
6766 goto error;
6767 startout = PyUnicode_AS_UNICODE(*v);
6768 }
6769 else {
6770 /* Extend unicode object */
6771 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6772 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6773 PyErr_NoMemory();
6774 goto error;
6775 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006776 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006777 goto error;
6778 startout = PyUnicode_AS_UNICODE(*v) + n;
6779 }
6780
6781 /* Decode the byte string character per character */
6782 out = startout;
6783 while (in < endin)
6784 {
6785 /* Decode a character */
6786 insize = 1;
6787 do
6788 {
6789 outsize = MultiByteToWideChar(code_page, flags,
6790 in, insize,
6791 buffer, Py_ARRAY_LENGTH(buffer));
6792 if (outsize > 0)
6793 break;
6794 err = GetLastError();
6795 if (err != ERROR_NO_UNICODE_TRANSLATION
6796 && err != ERROR_INSUFFICIENT_BUFFER)
6797 {
6798 PyErr_SetFromWindowsErr(0);
6799 goto error;
6800 }
6801 insize++;
6802 }
6803 /* 4=maximum length of a UTF-8 sequence */
6804 while (insize <= 4 && (in + insize) <= endin);
6805
6806 if (outsize <= 0) {
6807 Py_ssize_t startinpos, endinpos, outpos;
6808
6809 startinpos = in - startin;
6810 endinpos = startinpos + 1;
6811 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006812 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006813 errors, &errorHandler,
6814 encoding, reason,
6815 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006816 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006817 {
6818 goto error;
6819 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006820 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006821 }
6822 else {
6823 in += insize;
6824 memcpy(out, buffer, outsize * sizeof(wchar_t));
6825 out += outsize;
6826 }
6827 }
6828
6829 /* write a NUL character at the end */
6830 *out = 0;
6831
6832 /* Extend unicode object */
6833 outsize = out - startout;
6834 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006835 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006836 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006837 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006838
6839error:
6840 Py_XDECREF(encoding_obj);
6841 Py_XDECREF(errorHandler);
6842 Py_XDECREF(exc);
6843 return ret;
6844}
6845
Victor Stinner3a50e702011-10-18 21:21:00 +02006846static PyObject *
6847decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006848 const char *s, Py_ssize_t size,
6849 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850{
Victor Stinner76a31a62011-11-04 00:05:13 +01006851 PyObject *v = NULL;
6852 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
Victor Stinner3a50e702011-10-18 21:21:00 +02006854 if (code_page < 0) {
6855 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6856 return NULL;
6857 }
6858
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006859 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861
Victor Stinner76a31a62011-11-04 00:05:13 +01006862 do
6863 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006864#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006865 if (size > INT_MAX) {
6866 chunk_size = INT_MAX;
6867 final = 0;
6868 done = 0;
6869 }
6870 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006872 {
6873 chunk_size = (int)size;
6874 final = (consumed == NULL);
6875 done = 1;
6876 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006877
Victor Stinner76a31a62011-11-04 00:05:13 +01006878 /* Skip trailing lead-byte unless 'final' is set */
6879 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6880 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881
Victor Stinner76a31a62011-11-04 00:05:13 +01006882 if (chunk_size == 0 && done) {
6883 if (v != NULL)
6884 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006885 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006886 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887
Victor Stinner76a31a62011-11-04 00:05:13 +01006888
6889 converted = decode_code_page_strict(code_page, &v,
6890 s, chunk_size);
6891 if (converted == -2)
6892 converted = decode_code_page_errors(code_page, &v,
6893 s, chunk_size,
6894 errors);
6895 assert(converted != 0);
6896
6897 if (converted < 0) {
6898 Py_XDECREF(v);
6899 return NULL;
6900 }
6901
6902 if (consumed)
6903 *consumed += converted;
6904
6905 s += converted;
6906 size -= converted;
6907 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006908
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006909 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006910}
6911
Alexander Belopolsky40018472011-02-26 01:02:56 +00006912PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006913PyUnicode_DecodeCodePageStateful(int code_page,
6914 const char *s,
6915 Py_ssize_t size,
6916 const char *errors,
6917 Py_ssize_t *consumed)
6918{
6919 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6920}
6921
6922PyObject *
6923PyUnicode_DecodeMBCSStateful(const char *s,
6924 Py_ssize_t size,
6925 const char *errors,
6926 Py_ssize_t *consumed)
6927{
6928 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6929}
6930
6931PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006932PyUnicode_DecodeMBCS(const char *s,
6933 Py_ssize_t size,
6934 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006935{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006936 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6937}
6938
Victor Stinner3a50e702011-10-18 21:21:00 +02006939static DWORD
6940encode_code_page_flags(UINT code_page, const char *errors)
6941{
6942 if (code_page == CP_UTF8) {
6943 if (winver.dwMajorVersion >= 6)
6944 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6945 and later */
6946 return WC_ERR_INVALID_CHARS;
6947 else
6948 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6949 return 0;
6950 }
6951 else if (code_page == CP_UTF7) {
6952 /* CP_UTF7 only supports flags=0 */
6953 return 0;
6954 }
6955 else {
6956 if (errors != NULL && strcmp(errors, "replace") == 0)
6957 return 0;
6958 else
6959 return WC_NO_BEST_FIT_CHARS;
6960 }
6961}
6962
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006964 * Encode a Unicode string to a Windows code page into a byte string in strict
6965 * mode.
6966 *
6967 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006968 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006970static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006971encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006972 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974{
Victor Stinner554f3f02010-06-16 23:33:54 +00006975 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006976 BOOL *pusedDefaultChar = &usedDefaultChar;
6977 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006978 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006979 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006980 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 const DWORD flags = encode_code_page_flags(code_page, NULL);
6982 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006983 /* Create a substring so that we can get the UTF-16 representation
6984 of just the slice under consideration. */
6985 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986
Martin v. Löwis3d325192011-11-04 18:23:06 +01006987 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006988
Victor Stinner3a50e702011-10-18 21:21:00 +02006989 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006990 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006991 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006992 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006993
Victor Stinner2fc507f2011-11-04 20:06:39 +01006994 substring = PyUnicode_Substring(unicode, offset, offset+len);
6995 if (substring == NULL)
6996 return -1;
6997 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6998 if (p == NULL) {
6999 Py_DECREF(substring);
7000 return -1;
7001 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007002 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007003
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007004 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007006 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007007 NULL, 0,
7008 NULL, pusedDefaultChar);
7009 if (outsize <= 0)
7010 goto error;
7011 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007012 if (pusedDefaultChar && *pusedDefaultChar) {
7013 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007015 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007016
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007019 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007020 if (*outbytes == NULL) {
7021 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007023 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007024 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007025 }
7026 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007027 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 const Py_ssize_t n = PyBytes_Size(*outbytes);
7029 if (outsize > PY_SSIZE_T_MAX - n) {
7030 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007031 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007033 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007034 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7035 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007036 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007037 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007038 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007039 }
7040
7041 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007042 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007043 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 out, outsize,
7045 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007046 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007047 if (outsize <= 0)
7048 goto error;
7049 if (pusedDefaultChar && *pusedDefaultChar)
7050 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007052
Victor Stinner3a50e702011-10-18 21:21:00 +02007053error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007054 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007055 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7056 return -2;
7057 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007058 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007059}
7060
Victor Stinner3a50e702011-10-18 21:21:00 +02007061/*
7062 * Encode a Unicode string to a Windows code page into a byte string using a
7063 * error handler.
7064 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007065 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007066 * -1 on other error.
7067 */
7068static int
7069encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007070 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007071 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007072{
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007074 Py_ssize_t pos = unicode_offset;
7075 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007076 /* Ideally, we should get reason from FormatMessage. This is the Windows
7077 2000 English version of the message. */
7078 const char *reason = "invalid character";
7079 /* 4=maximum length of a UTF-8 sequence */
7080 char buffer[4];
7081 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7082 Py_ssize_t outsize;
7083 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007084 PyObject *errorHandler = NULL;
7085 PyObject *exc = NULL;
7086 PyObject *encoding_obj = NULL;
7087 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007088 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 PyObject *rep;
7090 int ret = -1;
7091
7092 assert(insize > 0);
7093
7094 encoding = code_page_name(code_page, &encoding_obj);
7095 if (encoding == NULL)
7096 return -1;
7097
7098 if (errors == NULL || strcmp(errors, "strict") == 0) {
7099 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7100 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007101 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007102 if (exc != NULL) {
7103 PyCodec_StrictErrors(exc);
7104 Py_DECREF(exc);
7105 }
7106 Py_XDECREF(encoding_obj);
7107 return -1;
7108 }
7109
7110 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7111 pusedDefaultChar = &usedDefaultChar;
7112 else
7113 pusedDefaultChar = NULL;
7114
7115 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7116 PyErr_NoMemory();
7117 goto error;
7118 }
7119 outsize = insize * Py_ARRAY_LENGTH(buffer);
7120
7121 if (*outbytes == NULL) {
7122 /* Create string object */
7123 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7124 if (*outbytes == NULL)
7125 goto error;
7126 out = PyBytes_AS_STRING(*outbytes);
7127 }
7128 else {
7129 /* Extend string object */
7130 Py_ssize_t n = PyBytes_Size(*outbytes);
7131 if (n > PY_SSIZE_T_MAX - outsize) {
7132 PyErr_NoMemory();
7133 goto error;
7134 }
7135 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7136 goto error;
7137 out = PyBytes_AS_STRING(*outbytes) + n;
7138 }
7139
7140 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007141 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007143 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7144 wchar_t chars[2];
7145 int charsize;
7146 if (ch < 0x10000) {
7147 chars[0] = (wchar_t)ch;
7148 charsize = 1;
7149 }
7150 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007151 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7152 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007153 charsize = 2;
7154 }
7155
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007157 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 buffer, Py_ARRAY_LENGTH(buffer),
7159 NULL, pusedDefaultChar);
7160 if (outsize > 0) {
7161 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7162 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007163 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 memcpy(out, buffer, outsize);
7165 out += outsize;
7166 continue;
7167 }
7168 }
7169 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7170 PyErr_SetFromWindowsErr(0);
7171 goto error;
7172 }
7173
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 rep = unicode_encode_call_errorhandler(
7175 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007176 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007177 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 if (rep == NULL)
7179 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007180 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007181
7182 if (PyBytes_Check(rep)) {
7183 outsize = PyBytes_GET_SIZE(rep);
7184 if (outsize != 1) {
7185 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7186 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7187 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7188 Py_DECREF(rep);
7189 goto error;
7190 }
7191 out = PyBytes_AS_STRING(*outbytes) + offset;
7192 }
7193 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7194 out += outsize;
7195 }
7196 else {
7197 Py_ssize_t i;
7198 enum PyUnicode_Kind kind;
7199 void *data;
7200
Benjamin Petersonbac79492012-01-14 13:34:47 -05007201 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 Py_DECREF(rep);
7203 goto error;
7204 }
7205
7206 outsize = PyUnicode_GET_LENGTH(rep);
7207 if (outsize != 1) {
7208 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7209 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7210 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7211 Py_DECREF(rep);
7212 goto error;
7213 }
7214 out = PyBytes_AS_STRING(*outbytes) + offset;
7215 }
7216 kind = PyUnicode_KIND(rep);
7217 data = PyUnicode_DATA(rep);
7218 for (i=0; i < outsize; i++) {
7219 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7220 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007221 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007222 encoding, unicode,
7223 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 "unable to encode error handler result to ASCII");
7225 Py_DECREF(rep);
7226 goto error;
7227 }
7228 *out = (unsigned char)ch;
7229 out++;
7230 }
7231 }
7232 Py_DECREF(rep);
7233 }
7234 /* write a NUL byte */
7235 *out = 0;
7236 outsize = out - PyBytes_AS_STRING(*outbytes);
7237 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7238 if (_PyBytes_Resize(outbytes, outsize) < 0)
7239 goto error;
7240 ret = 0;
7241
7242error:
7243 Py_XDECREF(encoding_obj);
7244 Py_XDECREF(errorHandler);
7245 Py_XDECREF(exc);
7246 return ret;
7247}
7248
Victor Stinner3a50e702011-10-18 21:21:00 +02007249static PyObject *
7250encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007251 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 const char *errors)
7253{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007254 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007256 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007257 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007258
Benjamin Petersonbac79492012-01-14 13:34:47 -05007259 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007260 return NULL;
7261 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007262
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 if (code_page < 0) {
7264 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7265 return NULL;
7266 }
7267
Martin v. Löwis3d325192011-11-04 18:23:06 +01007268 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007269 return PyBytes_FromStringAndSize(NULL, 0);
7270
Victor Stinner7581cef2011-11-03 22:32:33 +01007271 offset = 0;
7272 do
7273 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007274#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007275 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007276 chunks. */
7277 if (len > INT_MAX/2) {
7278 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007279 done = 0;
7280 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007281 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007282#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007283 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007284 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007285 done = 1;
7286 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007287
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007289 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007290 errors);
7291 if (ret == -2)
7292 ret = encode_code_page_errors(code_page, &outbytes,
7293 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007294 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007295 if (ret < 0) {
7296 Py_XDECREF(outbytes);
7297 return NULL;
7298 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007299
Victor Stinner7581cef2011-11-03 22:32:33 +01007300 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007301 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007302 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007303
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 return outbytes;
7305}
7306
7307PyObject *
7308PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7309 Py_ssize_t size,
7310 const char *errors)
7311{
Victor Stinner7581cef2011-11-03 22:32:33 +01007312 PyObject *unicode, *res;
7313 unicode = PyUnicode_FromUnicode(p, size);
7314 if (unicode == NULL)
7315 return NULL;
7316 res = encode_code_page(CP_ACP, unicode, errors);
7317 Py_DECREF(unicode);
7318 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007319}
7320
7321PyObject *
7322PyUnicode_EncodeCodePage(int code_page,
7323 PyObject *unicode,
7324 const char *errors)
7325{
Victor Stinner7581cef2011-11-03 22:32:33 +01007326 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007327}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007328
Alexander Belopolsky40018472011-02-26 01:02:56 +00007329PyObject *
7330PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007331{
7332 if (!PyUnicode_Check(unicode)) {
7333 PyErr_BadArgument();
7334 return NULL;
7335 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007336 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007337}
7338
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007339#undef NEED_RETRY
7340
Victor Stinner99b95382011-07-04 14:23:54 +02007341#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007342
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343/* --- Character Mapping Codec -------------------------------------------- */
7344
Victor Stinnerfb161b12013-04-18 01:44:27 +02007345static int
7346charmap_decode_string(const char *s,
7347 Py_ssize_t size,
7348 PyObject *mapping,
7349 const char *errors,
7350 _PyUnicodeWriter *writer)
7351{
7352 const char *starts = s;
7353 const char *e;
7354 Py_ssize_t startinpos, endinpos;
7355 PyObject *errorHandler = NULL, *exc = NULL;
7356 Py_ssize_t maplen;
7357 enum PyUnicode_Kind mapkind;
7358 void *mapdata;
7359 Py_UCS4 x;
7360 unsigned char ch;
7361
7362 if (PyUnicode_READY(mapping) == -1)
7363 return -1;
7364
7365 maplen = PyUnicode_GET_LENGTH(mapping);
7366 mapdata = PyUnicode_DATA(mapping);
7367 mapkind = PyUnicode_KIND(mapping);
7368
7369 e = s + size;
7370
7371 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7372 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7373 * is disabled in encoding aliases, latin1 is preferred because
7374 * its implementation is faster. */
7375 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7376 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7377 Py_UCS4 maxchar = writer->maxchar;
7378
7379 assert (writer->kind == PyUnicode_1BYTE_KIND);
7380 while (s < e) {
7381 ch = *s;
7382 x = mapdata_ucs1[ch];
7383 if (x > maxchar) {
7384 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7385 goto onError;
7386 maxchar = writer->maxchar;
7387 outdata = (Py_UCS1 *)writer->data;
7388 }
7389 outdata[writer->pos] = x;
7390 writer->pos++;
7391 ++s;
7392 }
7393 return 0;
7394 }
7395
7396 while (s < e) {
7397 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7398 enum PyUnicode_Kind outkind = writer->kind;
7399 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7400 if (outkind == PyUnicode_1BYTE_KIND) {
7401 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7402 Py_UCS4 maxchar = writer->maxchar;
7403 while (s < e) {
7404 ch = *s;
7405 x = mapdata_ucs2[ch];
7406 if (x > maxchar)
7407 goto Error;
7408 outdata[writer->pos] = x;
7409 writer->pos++;
7410 ++s;
7411 }
7412 break;
7413 }
7414 else if (outkind == PyUnicode_2BYTE_KIND) {
7415 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7416 while (s < e) {
7417 ch = *s;
7418 x = mapdata_ucs2[ch];
7419 if (x == 0xFFFE)
7420 goto Error;
7421 outdata[writer->pos] = x;
7422 writer->pos++;
7423 ++s;
7424 }
7425 break;
7426 }
7427 }
7428 ch = *s;
7429
7430 if (ch < maplen)
7431 x = PyUnicode_READ(mapkind, mapdata, ch);
7432 else
7433 x = 0xfffe; /* invalid value */
7434Error:
7435 if (x == 0xfffe)
7436 {
7437 /* undefined mapping */
7438 startinpos = s-starts;
7439 endinpos = startinpos+1;
7440 if (unicode_decode_call_errorhandler_writer(
7441 errors, &errorHandler,
7442 "charmap", "character maps to <undefined>",
7443 &starts, &e, &startinpos, &endinpos, &exc, &s,
7444 writer)) {
7445 goto onError;
7446 }
7447 continue;
7448 }
7449
7450 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7451 goto onError;
7452 ++s;
7453 }
7454 Py_XDECREF(errorHandler);
7455 Py_XDECREF(exc);
7456 return 0;
7457
7458onError:
7459 Py_XDECREF(errorHandler);
7460 Py_XDECREF(exc);
7461 return -1;
7462}
7463
7464static int
7465charmap_decode_mapping(const char *s,
7466 Py_ssize_t size,
7467 PyObject *mapping,
7468 const char *errors,
7469 _PyUnicodeWriter *writer)
7470{
7471 const char *starts = s;
7472 const char *e;
7473 Py_ssize_t startinpos, endinpos;
7474 PyObject *errorHandler = NULL, *exc = NULL;
7475 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007476 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007477
7478 e = s + size;
7479
7480 while (s < e) {
7481 ch = *s;
7482
7483 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7484 key = PyLong_FromLong((long)ch);
7485 if (key == NULL)
7486 goto onError;
7487
7488 item = PyObject_GetItem(mapping, key);
7489 Py_DECREF(key);
7490 if (item == NULL) {
7491 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7492 /* No mapping found means: mapping is undefined. */
7493 PyErr_Clear();
7494 goto Undefined;
7495 } else
7496 goto onError;
7497 }
7498
7499 /* Apply mapping */
7500 if (item == Py_None)
7501 goto Undefined;
7502 if (PyLong_Check(item)) {
7503 long value = PyLong_AS_LONG(item);
7504 if (value == 0xFFFE)
7505 goto Undefined;
7506 if (value < 0 || value > MAX_UNICODE) {
7507 PyErr_Format(PyExc_TypeError,
7508 "character mapping must be in range(0x%lx)",
7509 (unsigned long)MAX_UNICODE + 1);
7510 goto onError;
7511 }
7512
7513 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7514 goto onError;
7515 }
7516 else if (PyUnicode_Check(item)) {
7517 if (PyUnicode_READY(item) == -1)
7518 goto onError;
7519 if (PyUnicode_GET_LENGTH(item) == 1) {
7520 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7521 if (value == 0xFFFE)
7522 goto Undefined;
7523 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7524 goto onError;
7525 }
7526 else {
7527 writer->overallocate = 1;
7528 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7529 goto onError;
7530 }
7531 }
7532 else {
7533 /* wrong return value */
7534 PyErr_SetString(PyExc_TypeError,
7535 "character mapping must return integer, None or str");
7536 goto onError;
7537 }
7538 Py_CLEAR(item);
7539 ++s;
7540 continue;
7541
7542Undefined:
7543 /* undefined mapping */
7544 Py_CLEAR(item);
7545 startinpos = s-starts;
7546 endinpos = startinpos+1;
7547 if (unicode_decode_call_errorhandler_writer(
7548 errors, &errorHandler,
7549 "charmap", "character maps to <undefined>",
7550 &starts, &e, &startinpos, &endinpos, &exc, &s,
7551 writer)) {
7552 goto onError;
7553 }
7554 }
7555 Py_XDECREF(errorHandler);
7556 Py_XDECREF(exc);
7557 return 0;
7558
7559onError:
7560 Py_XDECREF(item);
7561 Py_XDECREF(errorHandler);
7562 Py_XDECREF(exc);
7563 return -1;
7564}
7565
Alexander Belopolsky40018472011-02-26 01:02:56 +00007566PyObject *
7567PyUnicode_DecodeCharmap(const char *s,
7568 Py_ssize_t size,
7569 PyObject *mapping,
7570 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007572 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007573
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 /* Default to Latin-1 */
7575 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007579 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007580 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007581 writer.min_length = size;
7582 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007584
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007585 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007586 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7587 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007588 }
7589 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007590 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7591 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007593 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007594
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007596 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 return NULL;
7598}
7599
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007600/* Charmap encoding: the lookup table */
7601
Alexander Belopolsky40018472011-02-26 01:02:56 +00007602struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 PyObject_HEAD
7604 unsigned char level1[32];
7605 int count2, count3;
7606 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007607};
7608
7609static PyObject*
7610encoding_map_size(PyObject *obj, PyObject* args)
7611{
7612 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007613 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007615}
7616
7617static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007618 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 PyDoc_STR("Return the size (in bytes) of this object") },
7620 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007621};
7622
7623static void
7624encoding_map_dealloc(PyObject* o)
7625{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007626 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007627}
7628
7629static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007630 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 "EncodingMap", /*tp_name*/
7632 sizeof(struct encoding_map), /*tp_basicsize*/
7633 0, /*tp_itemsize*/
7634 /* methods */
7635 encoding_map_dealloc, /*tp_dealloc*/
7636 0, /*tp_print*/
7637 0, /*tp_getattr*/
7638 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007639 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 0, /*tp_repr*/
7641 0, /*tp_as_number*/
7642 0, /*tp_as_sequence*/
7643 0, /*tp_as_mapping*/
7644 0, /*tp_hash*/
7645 0, /*tp_call*/
7646 0, /*tp_str*/
7647 0, /*tp_getattro*/
7648 0, /*tp_setattro*/
7649 0, /*tp_as_buffer*/
7650 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7651 0, /*tp_doc*/
7652 0, /*tp_traverse*/
7653 0, /*tp_clear*/
7654 0, /*tp_richcompare*/
7655 0, /*tp_weaklistoffset*/
7656 0, /*tp_iter*/
7657 0, /*tp_iternext*/
7658 encoding_map_methods, /*tp_methods*/
7659 0, /*tp_members*/
7660 0, /*tp_getset*/
7661 0, /*tp_base*/
7662 0, /*tp_dict*/
7663 0, /*tp_descr_get*/
7664 0, /*tp_descr_set*/
7665 0, /*tp_dictoffset*/
7666 0, /*tp_init*/
7667 0, /*tp_alloc*/
7668 0, /*tp_new*/
7669 0, /*tp_free*/
7670 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007671};
7672
7673PyObject*
7674PyUnicode_BuildEncodingMap(PyObject* string)
7675{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007676 PyObject *result;
7677 struct encoding_map *mresult;
7678 int i;
7679 int need_dict = 0;
7680 unsigned char level1[32];
7681 unsigned char level2[512];
7682 unsigned char *mlevel1, *mlevel2, *mlevel3;
7683 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007684 int kind;
7685 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007686 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007687 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007688
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007689 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007690 PyErr_BadArgument();
7691 return NULL;
7692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007693 kind = PyUnicode_KIND(string);
7694 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007695 length = PyUnicode_GET_LENGTH(string);
7696 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007697 memset(level1, 0xFF, sizeof level1);
7698 memset(level2, 0xFF, sizeof level2);
7699
7700 /* If there isn't a one-to-one mapping of NULL to \0,
7701 or if there are non-BMP characters, we need to use
7702 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007705 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007706 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007707 ch = PyUnicode_READ(kind, data, i);
7708 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007709 need_dict = 1;
7710 break;
7711 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007712 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713 /* unmapped character */
7714 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007715 l1 = ch >> 11;
7716 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007717 if (level1[l1] == 0xFF)
7718 level1[l1] = count2++;
7719 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007720 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007721 }
7722
7723 if (count2 >= 0xFF || count3 >= 0xFF)
7724 need_dict = 1;
7725
7726 if (need_dict) {
7727 PyObject *result = PyDict_New();
7728 PyObject *key, *value;
7729 if (!result)
7730 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007731 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007732 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007733 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007734 if (!key || !value)
7735 goto failed1;
7736 if (PyDict_SetItem(result, key, value) == -1)
7737 goto failed1;
7738 Py_DECREF(key);
7739 Py_DECREF(value);
7740 }
7741 return result;
7742 failed1:
7743 Py_XDECREF(key);
7744 Py_XDECREF(value);
7745 Py_DECREF(result);
7746 return NULL;
7747 }
7748
7749 /* Create a three-level trie */
7750 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7751 16*count2 + 128*count3 - 1);
7752 if (!result)
7753 return PyErr_NoMemory();
7754 PyObject_Init(result, &EncodingMapType);
7755 mresult = (struct encoding_map*)result;
7756 mresult->count2 = count2;
7757 mresult->count3 = count3;
7758 mlevel1 = mresult->level1;
7759 mlevel2 = mresult->level23;
7760 mlevel3 = mresult->level23 + 16*count2;
7761 memcpy(mlevel1, level1, 32);
7762 memset(mlevel2, 0xFF, 16*count2);
7763 memset(mlevel3, 0, 128*count3);
7764 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007765 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007767 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7768 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769 /* unmapped character */
7770 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007771 o1 = ch>>11;
7772 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 i2 = 16*mlevel1[o1] + o2;
7774 if (mlevel2[i2] == 0xFF)
7775 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007776 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777 i3 = 128*mlevel2[i2] + o3;
7778 mlevel3[i3] = i;
7779 }
7780 return result;
7781}
7782
7783static int
Victor Stinner22168992011-11-20 17:09:18 +01007784encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007785{
7786 struct encoding_map *map = (struct encoding_map*)mapping;
7787 int l1 = c>>11;
7788 int l2 = (c>>7) & 0xF;
7789 int l3 = c & 0x7F;
7790 int i;
7791
Victor Stinner22168992011-11-20 17:09:18 +01007792 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007794 if (c == 0)
7795 return 0;
7796 /* level 1*/
7797 i = map->level1[l1];
7798 if (i == 0xFF) {
7799 return -1;
7800 }
7801 /* level 2*/
7802 i = map->level23[16*i+l2];
7803 if (i == 0xFF) {
7804 return -1;
7805 }
7806 /* level 3 */
7807 i = map->level23[16*map->count2 + 128*i + l3];
7808 if (i == 0) {
7809 return -1;
7810 }
7811 return i;
7812}
7813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814/* Lookup the character ch in the mapping. If the character
7815 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007816 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007818charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819{
Christian Heimes217cfd12007-12-02 14:31:20 +00007820 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 PyObject *x;
7822
7823 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007825 x = PyObject_GetItem(mapping, w);
7826 Py_DECREF(w);
7827 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7829 /* No mapping found means: mapping is undefined. */
7830 PyErr_Clear();
7831 x = Py_None;
7832 Py_INCREF(x);
7833 return x;
7834 } else
7835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007837 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007839 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 long value = PyLong_AS_LONG(x);
7841 if (value < 0 || value > 255) {
7842 PyErr_SetString(PyExc_TypeError,
7843 "character mapping must be in range(256)");
7844 Py_DECREF(x);
7845 return NULL;
7846 }
7847 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007849 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 /* wrong return value */
7853 PyErr_Format(PyExc_TypeError,
7854 "character mapping must return integer, bytes or None, not %.400s",
7855 x->ob_type->tp_name);
7856 Py_DECREF(x);
7857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 }
7859}
7860
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007862charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7865 /* exponentially overallocate to minimize reallocations */
7866 if (requiredsize < 2*outsize)
7867 requiredsize = 2*outsize;
7868 if (_PyBytes_Resize(outobj, requiredsize))
7869 return -1;
7870 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871}
7872
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007877 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007878 space is available. Return a new reference to the object that
7879 was put in the output buffer, or Py_None, if the mapping was undefined
7880 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007881 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007882static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007883charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007884 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886 PyObject *rep;
7887 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007888 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889
Christian Heimes90aa7642007-12-19 02:45:37 +00007890 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 if (res == -1)
7894 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 if (outsize<requiredsize)
7896 if (charmapencode_resize(outobj, outpos, requiredsize))
7897 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007898 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 outstart[(*outpos)++] = (char)res;
7900 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 }
7902
7903 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007904 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 Py_DECREF(rep);
7908 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 if (PyLong_Check(rep)) {
7911 Py_ssize_t requiredsize = *outpos+1;
7912 if (outsize<requiredsize)
7913 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7914 Py_DECREF(rep);
7915 return enc_EXCEPTION;
7916 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007917 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 else {
7921 const char *repchars = PyBytes_AS_STRING(rep);
7922 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7923 Py_ssize_t requiredsize = *outpos+repsize;
7924 if (outsize<requiredsize)
7925 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7926 Py_DECREF(rep);
7927 return enc_EXCEPTION;
7928 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007929 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 memcpy(outstart + *outpos, repchars, repsize);
7931 *outpos += repsize;
7932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934 Py_DECREF(rep);
7935 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936}
7937
7938/* handle an error in PyUnicode_EncodeCharmap
7939 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007940static int
7941charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007942 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007944 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007945 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946{
7947 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007948 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007949 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007950 enum PyUnicode_Kind kind;
7951 void *data;
7952 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007954 Py_ssize_t collstartpos = *inpos;
7955 Py_ssize_t collendpos = *inpos+1;
7956 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 char *encoding = "charmap";
7958 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007960 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007961 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962
Benjamin Petersonbac79492012-01-14 13:34:47 -05007963 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007964 return -1;
7965 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966 /* find all unencodable characters */
7967 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007969 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007970 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007971 val = encoding_map_lookup(ch, mapping);
7972 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 break;
7974 ++collendpos;
7975 continue;
7976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007977
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007978 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7979 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 if (rep==NULL)
7981 return -1;
7982 else if (rep!=Py_None) {
7983 Py_DECREF(rep);
7984 break;
7985 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988 }
7989 /* cache callback name lookup
7990 * (if not done yet, i.e. it's the first error) */
7991 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 if ((errors==NULL) || (!strcmp(errors, "strict")))
7993 *known_errorHandler = 1;
7994 else if (!strcmp(errors, "replace"))
7995 *known_errorHandler = 2;
7996 else if (!strcmp(errors, "ignore"))
7997 *known_errorHandler = 3;
7998 else if (!strcmp(errors, "xmlcharrefreplace"))
7999 *known_errorHandler = 4;
8000 else
8001 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008002 }
8003 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008005 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008006 return -1;
8007 case 2: /* replace */
8008 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 x = charmapencode_output('?', mapping, res, respos);
8010 if (x==enc_EXCEPTION) {
8011 return -1;
8012 }
8013 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008014 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return -1;
8016 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008017 }
8018 /* fall through */
8019 case 3: /* ignore */
8020 *inpos = collendpos;
8021 break;
8022 case 4: /* xmlcharrefreplace */
8023 /* generate replacement (temporarily (mis)uses p) */
8024 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 char buffer[2+29+1+1];
8026 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008027 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 for (cp = buffer; *cp; ++cp) {
8029 x = charmapencode_output(*cp, mapping, res, respos);
8030 if (x==enc_EXCEPTION)
8031 return -1;
8032 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008033 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 return -1;
8035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 }
8037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 *inpos = collendpos;
8039 break;
8040 default:
8041 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008042 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008046 if (PyBytes_Check(repunicode)) {
8047 /* Directly copy bytes result to output. */
8048 Py_ssize_t outsize = PyBytes_Size(*res);
8049 Py_ssize_t requiredsize;
8050 repsize = PyBytes_Size(repunicode);
8051 requiredsize = *respos + repsize;
8052 if (requiredsize > outsize)
8053 /* Make room for all additional bytes. */
8054 if (charmapencode_resize(res, respos, requiredsize)) {
8055 Py_DECREF(repunicode);
8056 return -1;
8057 }
8058 memcpy(PyBytes_AsString(*res) + *respos,
8059 PyBytes_AsString(repunicode), repsize);
8060 *respos += repsize;
8061 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008062 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008063 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008064 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008065 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008066 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008067 Py_DECREF(repunicode);
8068 return -1;
8069 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008070 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008071 data = PyUnicode_DATA(repunicode);
8072 kind = PyUnicode_KIND(repunicode);
8073 for (index = 0; index < repsize; index++) {
8074 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8075 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008077 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 return -1;
8079 }
8080 else if (x==enc_FAILED) {
8081 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008082 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 return -1;
8084 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 }
8086 *inpos = newpos;
8087 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 }
8089 return 0;
8090}
8091
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093_PyUnicode_EncodeCharmap(PyObject *unicode,
8094 PyObject *mapping,
8095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 /* output object */
8098 PyObject *res = NULL;
8099 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 PyObject *errorHandler = NULL;
8105 PyObject *exc = NULL;
8106 /* the following variable is used for caching string comparisons
8107 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8108 * 3=ignore, 4=xmlcharrefreplace */
8109 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008110 void *data;
8111 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112
Benjamin Petersonbac79492012-01-14 13:34:47 -05008113 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008114 return NULL;
8115 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008116 data = PyUnicode_DATA(unicode);
8117 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008118
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 /* Default to Latin-1 */
8120 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008121 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008123 /* allocate enough for a simple encoding without
8124 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008125 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008126 if (res == NULL)
8127 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008128 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008132 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 if (x==enc_EXCEPTION) /* error */
8136 goto onError;
8137 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008138 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 &exc,
8140 &known_errorHandler, &errorHandler, errors,
8141 &res, &respos)) {
8142 goto onError;
8143 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008144 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 else
8146 /* done with this character => adjust input position */
8147 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008151 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008152 if (_PyBytes_Resize(&res, respos) < 0)
8153 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008154
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008155 Py_XDECREF(exc);
8156 Py_XDECREF(errorHandler);
8157 return res;
8158
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008160 Py_XDECREF(res);
8161 Py_XDECREF(exc);
8162 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 return NULL;
8164}
8165
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008166/* Deprecated */
8167PyObject *
8168PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8169 Py_ssize_t size,
8170 PyObject *mapping,
8171 const char *errors)
8172{
8173 PyObject *result;
8174 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8175 if (unicode == NULL)
8176 return NULL;
8177 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8178 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008179 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008180}
8181
Alexander Belopolsky40018472011-02-26 01:02:56 +00008182PyObject *
8183PyUnicode_AsCharmapString(PyObject *unicode,
8184 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185{
8186 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 PyErr_BadArgument();
8188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008190 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191}
8192
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008194static void
8195make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008197 Py_ssize_t startpos, Py_ssize_t endpos,
8198 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008200 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201 *exceptionObject = _PyUnicodeTranslateError_Create(
8202 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 }
8204 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8206 goto onError;
8207 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8208 goto onError;
8209 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8210 goto onError;
8211 return;
8212 onError:
8213 Py_DECREF(*exceptionObject);
8214 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 }
8216}
8217
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008218/* error handling callback helper:
8219 build arguments, call the callback and check the arguments,
8220 put the result into newpos and return the replacement string, which
8221 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008222static PyObject *
8223unicode_translate_call_errorhandler(const char *errors,
8224 PyObject **errorHandler,
8225 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008226 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008227 Py_ssize_t startpos, Py_ssize_t endpos,
8228 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008230 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008232 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008233 PyObject *restuple;
8234 PyObject *resunicode;
8235
8236 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 }
8241
8242 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008246
8247 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008252 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 Py_DECREF(restuple);
8254 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255 }
8256 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 &resunicode, &i_newpos)) {
8258 Py_DECREF(restuple);
8259 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008261 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008263 else
8264 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8267 Py_DECREF(restuple);
8268 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008269 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 Py_INCREF(resunicode);
8271 Py_DECREF(restuple);
8272 return resunicode;
8273}
8274
8275/* Lookup the character ch in the mapping and put the result in result,
8276 which must be decrefed by the caller.
8277 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008278static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008279charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280{
Christian Heimes217cfd12007-12-02 14:31:20 +00008281 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 PyObject *x;
8283
8284 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 x = PyObject_GetItem(mapping, w);
8287 Py_DECREF(w);
8288 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8290 /* No mapping found means: use 1:1 mapping. */
8291 PyErr_Clear();
8292 *result = NULL;
8293 return 0;
8294 } else
8295 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296 }
8297 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 *result = x;
8299 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008301 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 long value = PyLong_AS_LONG(x);
8303 long max = PyUnicode_GetMax();
8304 if (value < 0 || value > max) {
8305 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008306 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 Py_DECREF(x);
8308 return -1;
8309 }
8310 *result = x;
8311 return 0;
8312 }
8313 else if (PyUnicode_Check(x)) {
8314 *result = x;
8315 return 0;
8316 }
8317 else {
8318 /* wrong return value */
8319 PyErr_SetString(PyExc_TypeError,
8320 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008321 Py_DECREF(x);
8322 return -1;
8323 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324}
8325/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 if not reallocate and adjust various state variables.
8327 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008333 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008334 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 /* exponentially overallocate to minimize reallocations */
8336 if (requiredsize < 2 * oldsize)
8337 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008338 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8339 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008341 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 }
8344 return 0;
8345}
8346/* lookup the character, put the result in the output string and adjust
8347 various state variables. Return a new reference to the object that
8348 was put in the output buffer in *result, or Py_None, if the mapping was
8349 undefined (in which case no character was written).
8350 The called must decref result.
8351 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008352static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8354 PyObject *mapping, Py_UCS4 **output,
8355 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8359 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008363 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364 }
8365 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008367 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 }
8371 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 Py_ssize_t repsize;
8373 if (PyUnicode_READY(*res) == -1)
8374 return -1;
8375 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 if (repsize==1) {
8377 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 }
8380 else if (repsize!=0) {
8381 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 Py_ssize_t requiredsize = *opos +
8383 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 Py_ssize_t i;
8386 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 for(i = 0; i < repsize; i++)
8389 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 }
8392 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 return 0;
8395}
8396
Alexander Belopolsky40018472011-02-26 01:02:56 +00008397PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398_PyUnicode_TranslateCharmap(PyObject *input,
8399 PyObject *mapping,
8400 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 /* input object */
8403 char *idata;
8404 Py_ssize_t size, i;
8405 int kind;
8406 /* output buffer */
8407 Py_UCS4 *output = NULL;
8408 Py_ssize_t osize;
8409 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 char *reason = "character maps to <undefined>";
8413 PyObject *errorHandler = NULL;
8414 PyObject *exc = NULL;
8415 /* the following variable is used for caching string comparisons
8416 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8417 * 3=ignore, 4=xmlcharrefreplace */
8418 int known_errorHandler = -1;
8419
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 PyErr_BadArgument();
8422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425 if (PyUnicode_READY(input) == -1)
8426 return NULL;
8427 idata = (char*)PyUnicode_DATA(input);
8428 kind = PyUnicode_KIND(input);
8429 size = PyUnicode_GET_LENGTH(input);
8430 i = 0;
8431
8432 if (size == 0) {
8433 Py_INCREF(input);
8434 return input;
8435 }
8436
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 /* allocate enough for a simple 1:1 translation without
8438 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439 osize = size;
8440 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8441 opos = 0;
8442 if (output == NULL) {
8443 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 /* try to encode it */
8449 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 if (charmaptranslate_output(input, i, mapping,
8451 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 Py_XDECREF(x);
8453 goto onError;
8454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 else { /* untranslatable character */
8459 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8460 Py_ssize_t repsize;
8461 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 Py_ssize_t collstart = i;
8465 Py_ssize_t collend = i+1;
8466 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 while (collend < size) {
8470 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 goto onError;
8472 Py_XDECREF(x);
8473 if (x!=Py_None)
8474 break;
8475 ++collend;
8476 }
8477 /* cache callback name lookup
8478 * (if not done yet, i.e. it's the first error) */
8479 if (known_errorHandler==-1) {
8480 if ((errors==NULL) || (!strcmp(errors, "strict")))
8481 known_errorHandler = 1;
8482 else if (!strcmp(errors, "replace"))
8483 known_errorHandler = 2;
8484 else if (!strcmp(errors, "ignore"))
8485 known_errorHandler = 3;
8486 else if (!strcmp(errors, "xmlcharrefreplace"))
8487 known_errorHandler = 4;
8488 else
8489 known_errorHandler = 0;
8490 }
8491 switch (known_errorHandler) {
8492 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008493 make_translate_exception(&exc,
8494 input, collstart, collend, reason);
8495 if (exc != NULL)
8496 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008497 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 case 2: /* replace */
8499 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 for (coll = collstart; coll<collend; coll++)
8501 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 /* fall through */
8503 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 break;
8506 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 /* generate replacement (temporarily (mis)uses i) */
8508 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 char buffer[2+29+1+1];
8510 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8512 if (charmaptranslate_makespace(&output, &osize,
8513 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 goto onError;
8515 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 break;
8520 default:
8521 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 reason, input, &exc,
8523 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008524 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008526 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008527 Py_DECREF(repunicode);
8528 goto onError;
8529 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 repsize = PyUnicode_GET_LENGTH(repunicode);
8532 if (charmaptranslate_makespace(&output, &osize,
8533 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 Py_DECREF(repunicode);
8535 goto onError;
8536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 for (uni2 = 0; repsize-->0; ++uni2)
8538 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8539 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008541 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008542 }
8543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8545 if (!res)
8546 goto onError;
8547 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 Py_XDECREF(exc);
8549 Py_XDECREF(errorHandler);
8550 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 Py_XDECREF(exc);
8555 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 return NULL;
8557}
8558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559/* Deprecated. Use PyUnicode_Translate instead. */
8560PyObject *
8561PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8562 Py_ssize_t size,
8563 PyObject *mapping,
8564 const char *errors)
8565{
Christian Heimes5f520f42012-09-11 14:03:25 +02008566 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8568 if (!unicode)
8569 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008570 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8571 Py_DECREF(unicode);
8572 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573}
8574
Alexander Belopolsky40018472011-02-26 01:02:56 +00008575PyObject *
8576PyUnicode_Translate(PyObject *str,
8577 PyObject *mapping,
8578 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579{
8580 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008581
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 str = PyUnicode_FromObject(str);
8583 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008584 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 Py_DECREF(str);
8587 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588}
Tim Petersced69f82003-09-16 20:30:58 +00008589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008591fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592{
8593 /* No need to call PyUnicode_READY(self) because this function is only
8594 called as a callback from fixup() which does it already. */
8595 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8596 const int kind = PyUnicode_KIND(self);
8597 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008598 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008599 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 Py_ssize_t i;
8601
8602 for (i = 0; i < len; ++i) {
8603 ch = PyUnicode_READ(kind, data, i);
8604 fixed = 0;
8605 if (ch > 127) {
8606 if (Py_UNICODE_ISSPACE(ch))
8607 fixed = ' ';
8608 else {
8609 const int decimal = Py_UNICODE_TODECIMAL(ch);
8610 if (decimal >= 0)
8611 fixed = '0' + decimal;
8612 }
8613 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008614 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008615 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 PyUnicode_WRITE(kind, data, i, fixed);
8617 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008618 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008619 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 }
8622
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008623 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624}
8625
8626PyObject *
8627_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8628{
8629 if (!PyUnicode_Check(unicode)) {
8630 PyErr_BadInternalCall();
8631 return NULL;
8632 }
8633 if (PyUnicode_READY(unicode) == -1)
8634 return NULL;
8635 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8636 /* If the string is already ASCII, just return the same string */
8637 Py_INCREF(unicode);
8638 return unicode;
8639 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008640 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641}
8642
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008643PyObject *
8644PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8645 Py_ssize_t length)
8646{
Victor Stinnerf0124502011-11-21 23:12:56 +01008647 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008648 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008649 Py_UCS4 maxchar;
8650 enum PyUnicode_Kind kind;
8651 void *data;
8652
Victor Stinner99d7ad02012-02-22 13:37:39 +01008653 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008654 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008655 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008656 if (ch > 127) {
8657 int decimal = Py_UNICODE_TODECIMAL(ch);
8658 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008659 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008660 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008661 }
8662 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008663
8664 /* Copy to a new string */
8665 decimal = PyUnicode_New(length, maxchar);
8666 if (decimal == NULL)
8667 return decimal;
8668 kind = PyUnicode_KIND(decimal);
8669 data = PyUnicode_DATA(decimal);
8670 /* Iterate over code points */
8671 for (i = 0; i < length; i++) {
8672 Py_UNICODE ch = s[i];
8673 if (ch > 127) {
8674 int decimal = Py_UNICODE_TODECIMAL(ch);
8675 if (decimal >= 0)
8676 ch = '0' + decimal;
8677 }
8678 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008680 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008681}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008682/* --- Decimal Encoder ---------------------------------------------------- */
8683
Alexander Belopolsky40018472011-02-26 01:02:56 +00008684int
8685PyUnicode_EncodeDecimal(Py_UNICODE *s,
8686 Py_ssize_t length,
8687 char *output,
8688 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008689{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008690 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008691 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008692 enum PyUnicode_Kind kind;
8693 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008694
8695 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 PyErr_BadArgument();
8697 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008698 }
8699
Victor Stinner42bf7752011-11-21 22:52:58 +01008700 unicode = PyUnicode_FromUnicode(s, length);
8701 if (unicode == NULL)
8702 return -1;
8703
Benjamin Petersonbac79492012-01-14 13:34:47 -05008704 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008705 Py_DECREF(unicode);
8706 return -1;
8707 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008708 kind = PyUnicode_KIND(unicode);
8709 data = PyUnicode_DATA(unicode);
8710
Victor Stinnerb84d7232011-11-22 01:50:07 +01008711 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008712 PyObject *exc;
8713 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008715 Py_ssize_t startpos;
8716
8717 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008718
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008720 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008721 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008723 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 decimal = Py_UNICODE_TODECIMAL(ch);
8725 if (decimal >= 0) {
8726 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008727 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 continue;
8729 }
8730 if (0 < ch && ch < 256) {
8731 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008732 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 continue;
8734 }
Victor Stinner6345be92011-11-25 20:09:01 +01008735
Victor Stinner42bf7752011-11-21 22:52:58 +01008736 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008737 exc = NULL;
8738 raise_encode_exception(&exc, "decimal", unicode,
8739 startpos, startpos+1,
8740 "invalid decimal Unicode string");
8741 Py_XDECREF(exc);
8742 Py_DECREF(unicode);
8743 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008744 }
8745 /* 0-terminate the output string */
8746 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008747 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008748 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008749}
8750
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751/* --- Helpers ------------------------------------------------------------ */
8752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008754any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 Py_ssize_t start,
8756 Py_ssize_t end)
8757{
8758 int kind1, kind2, kind;
8759 void *buf1, *buf2;
8760 Py_ssize_t len1, len2, result;
8761
8762 kind1 = PyUnicode_KIND(s1);
8763 kind2 = PyUnicode_KIND(s2);
8764 kind = kind1 > kind2 ? kind1 : kind2;
8765 buf1 = PyUnicode_DATA(s1);
8766 buf2 = PyUnicode_DATA(s2);
8767 if (kind1 != kind)
8768 buf1 = _PyUnicode_AsKind(s1, kind);
8769 if (!buf1)
8770 return -2;
8771 if (kind2 != kind)
8772 buf2 = _PyUnicode_AsKind(s2, kind);
8773 if (!buf2) {
8774 if (kind1 != kind) PyMem_Free(buf1);
8775 return -2;
8776 }
8777 len1 = PyUnicode_GET_LENGTH(s1);
8778 len2 = PyUnicode_GET_LENGTH(s2);
8779
Victor Stinner794d5672011-10-10 03:21:36 +02008780 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008781 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008782 case PyUnicode_1BYTE_KIND:
8783 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8784 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8785 else
8786 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8787 break;
8788 case PyUnicode_2BYTE_KIND:
8789 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8790 break;
8791 case PyUnicode_4BYTE_KIND:
8792 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8793 break;
8794 default:
8795 assert(0); result = -2;
8796 }
8797 }
8798 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008799 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008800 case PyUnicode_1BYTE_KIND:
8801 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8802 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8803 else
8804 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8805 break;
8806 case PyUnicode_2BYTE_KIND:
8807 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8808 break;
8809 case PyUnicode_4BYTE_KIND:
8810 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8811 break;
8812 default:
8813 assert(0); result = -2;
8814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 }
8816
8817 if (kind1 != kind)
8818 PyMem_Free(buf1);
8819 if (kind2 != kind)
8820 PyMem_Free(buf2);
8821
8822 return result;
8823}
8824
8825Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008826_PyUnicode_InsertThousandsGrouping(
8827 PyObject *unicode, Py_ssize_t index,
8828 Py_ssize_t n_buffer,
8829 void *digits, Py_ssize_t n_digits,
8830 Py_ssize_t min_width,
8831 const char *grouping, PyObject *thousands_sep,
8832 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833{
Victor Stinner41a863c2012-02-24 00:37:51 +01008834 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008835 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008836 Py_ssize_t thousands_sep_len;
8837 Py_ssize_t len;
8838
8839 if (unicode != NULL) {
8840 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008841 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008842 }
8843 else {
8844 kind = PyUnicode_1BYTE_KIND;
8845 data = NULL;
8846 }
8847 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8848 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8849 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8850 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008851 if (thousands_sep_kind < kind) {
8852 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8853 if (!thousands_sep_data)
8854 return -1;
8855 }
8856 else {
8857 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8858 if (!data)
8859 return -1;
8860 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008861 }
8862
Benjamin Petersonead6b532011-12-20 17:23:42 -06008863 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008865 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008866 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008867 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008868 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008869 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008870 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008871 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008872 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008873 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008874 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008875 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008877 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008878 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008879 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008880 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008881 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008883 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008884 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008885 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008886 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008887 break;
8888 default:
8889 assert(0);
8890 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008892 if (unicode != NULL && thousands_sep_kind != kind) {
8893 if (thousands_sep_kind < kind)
8894 PyMem_Free(thousands_sep_data);
8895 else
8896 PyMem_Free(data);
8897 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008898 if (unicode == NULL) {
8899 *maxchar = 127;
8900 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008901 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008902 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008903 }
8904 }
8905 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906}
8907
8908
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped at 0.  `start` and `end` must be modifiable lvalues;
   `len` may be evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe as the body of an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008923
Alexander Belopolsky40018472011-02-26 01:02:56 +00008924Py_ssize_t
8925PyUnicode_Count(PyObject *str,
8926 PyObject *substr,
8927 Py_ssize_t start,
8928 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008930 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008931 PyObject* str_obj;
8932 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 int kind1, kind2, kind;
8934 void *buf1 = NULL, *buf2 = NULL;
8935 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008936
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008937 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008938 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008940 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008941 if (!sub_obj) {
8942 Py_DECREF(str_obj);
8943 return -1;
8944 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008945 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008946 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 Py_DECREF(str_obj);
8948 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 }
Tim Petersced69f82003-09-16 20:30:58 +00008950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 kind1 = PyUnicode_KIND(str_obj);
8952 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008953 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008956 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008957 if (kind2 > kind) {
8958 Py_DECREF(sub_obj);
8959 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008960 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008961 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008962 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 if (!buf2)
8965 goto onError;
8966 len1 = PyUnicode_GET_LENGTH(str_obj);
8967 len2 = PyUnicode_GET_LENGTH(sub_obj);
8968
8969 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008970 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008972 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8973 result = asciilib_count(
8974 ((Py_UCS1*)buf1) + start, end - start,
8975 buf2, len2, PY_SSIZE_T_MAX
8976 );
8977 else
8978 result = ucs1lib_count(
8979 ((Py_UCS1*)buf1) + start, end - start,
8980 buf2, len2, PY_SSIZE_T_MAX
8981 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 break;
8983 case PyUnicode_2BYTE_KIND:
8984 result = ucs2lib_count(
8985 ((Py_UCS2*)buf1) + start, end - start,
8986 buf2, len2, PY_SSIZE_T_MAX
8987 );
8988 break;
8989 case PyUnicode_4BYTE_KIND:
8990 result = ucs4lib_count(
8991 ((Py_UCS4*)buf1) + start, end - start,
8992 buf2, len2, PY_SSIZE_T_MAX
8993 );
8994 break;
8995 default:
8996 assert(0); result = 0;
8997 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008998
8999 Py_DECREF(sub_obj);
9000 Py_DECREF(str_obj);
9001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 if (kind2 != kind)
9003 PyMem_Free(buf2);
9004
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 onError:
9007 Py_DECREF(sub_obj);
9008 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (kind2 != kind && buf2)
9010 PyMem_Free(buf2);
9011 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012}
9013
Alexander Belopolsky40018472011-02-26 01:02:56 +00009014Py_ssize_t
9015PyUnicode_Find(PyObject *str,
9016 PyObject *sub,
9017 Py_ssize_t start,
9018 Py_ssize_t end,
9019 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009021 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009022
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009024 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009026 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009027 if (!sub) {
9028 Py_DECREF(str);
9029 return -2;
9030 }
9031 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9032 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 Py_DECREF(str);
9034 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035 }
Tim Petersced69f82003-09-16 20:30:58 +00009036
Victor Stinner794d5672011-10-10 03:21:36 +02009037 result = any_find_slice(direction,
9038 str, sub, start, end
9039 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009040
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009042 Py_DECREF(sub);
9043
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 return result;
9045}
9046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047Py_ssize_t
9048PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9049 Py_ssize_t start, Py_ssize_t end,
9050 int direction)
9051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009053 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 if (PyUnicode_READY(str) == -1)
9055 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009056 if (start < 0 || end < 0) {
9057 PyErr_SetString(PyExc_IndexError, "string index out of range");
9058 return -2;
9059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 if (end > PyUnicode_GET_LENGTH(str))
9061 end = PyUnicode_GET_LENGTH(str);
9062 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009063 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9064 kind, end-start, ch, direction);
9065 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009067 else
9068 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069}
9070
Alexander Belopolsky40018472011-02-26 01:02:56 +00009071static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009072tailmatch(PyObject *self,
9073 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009074 Py_ssize_t start,
9075 Py_ssize_t end,
9076 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 int kind_self;
9079 int kind_sub;
9080 void *data_self;
9081 void *data_sub;
9082 Py_ssize_t offset;
9083 Py_ssize_t i;
9084 Py_ssize_t end_sub;
9085
9086 if (PyUnicode_READY(self) == -1 ||
9087 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009088 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089
9090 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 return 1;
9092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9094 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 kind_self = PyUnicode_KIND(self);
9099 data_self = PyUnicode_DATA(self);
9100 kind_sub = PyUnicode_KIND(substring);
9101 data_sub = PyUnicode_DATA(substring);
9102 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9103
9104 if (direction > 0)
9105 offset = end;
9106 else
9107 offset = start;
9108
9109 if (PyUnicode_READ(kind_self, data_self, offset) ==
9110 PyUnicode_READ(kind_sub, data_sub, 0) &&
9111 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9112 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9113 /* If both are of the same kind, memcmp is sufficient */
9114 if (kind_self == kind_sub) {
9115 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009116 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 data_sub,
9118 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009119 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 }
9121 /* otherwise we have to compare each character by first accesing it */
9122 else {
9123 /* We do not need to compare 0 and len(substring)-1 because
9124 the if statement above ensured already that they are equal
9125 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 for (i = 1; i < end_sub; ++i) {
9127 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9128 PyUnicode_READ(kind_sub, data_sub, i))
9129 return 0;
9130 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133 }
9134
9135 return 0;
9136}
9137
Alexander Belopolsky40018472011-02-26 01:02:56 +00009138Py_ssize_t
9139PyUnicode_Tailmatch(PyObject *str,
9140 PyObject *substr,
9141 Py_ssize_t start,
9142 Py_ssize_t end,
9143 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009145 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009146
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 str = PyUnicode_FromObject(str);
9148 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 substr = PyUnicode_FromObject(substr);
9151 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 Py_DECREF(str);
9153 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 }
Tim Petersced69f82003-09-16 20:30:58 +00009155
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009156 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 Py_DECREF(str);
9159 Py_DECREF(substr);
9160 return result;
9161}
9162
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163/* Apply fixfct filter to the Unicode object self and return a
9164 reference to the modified object */
9165
Alexander Belopolsky40018472011-02-26 01:02:56 +00009166static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009167fixup(PyObject *self,
9168 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 PyObject *u;
9171 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009172 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009174 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009176 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009177 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 /* fix functions return the new maximum character in a string,
9180 if the kind of the resulting unicode object does not change,
9181 everything is fine. Otherwise we need to change the string kind
9182 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009183 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009184
9185 if (maxchar_new == 0) {
9186 /* no changes */;
9187 if (PyUnicode_CheckExact(self)) {
9188 Py_DECREF(u);
9189 Py_INCREF(self);
9190 return self;
9191 }
9192 else
9193 return u;
9194 }
9195
Victor Stinnere6abb482012-05-02 01:15:40 +02009196 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197
Victor Stinnereaab6042011-12-11 22:22:39 +01009198 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009200
9201 /* In case the maximum character changed, we need to
9202 convert the string to the new category. */
9203 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9204 if (v == NULL) {
9205 Py_DECREF(u);
9206 return NULL;
9207 }
9208 if (maxchar_new > maxchar_old) {
9209 /* If the maxchar increased so that the kind changed, not all
9210 characters are representable anymore and we need to fix the
9211 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009212 _PyUnicode_FastCopyCharacters(v, 0,
9213 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009214 maxchar_old = fixfct(v);
9215 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 }
9217 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009218 _PyUnicode_FastCopyCharacters(v, 0,
9219 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009221 Py_DECREF(u);
9222 assert(_PyUnicode_CheckConsistency(v, 1));
9223 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224}
9225
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009226static PyObject *
9227ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009229 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9230 char *resdata, *data = PyUnicode_DATA(self);
9231 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009232
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009233 res = PyUnicode_New(len, 127);
9234 if (res == NULL)
9235 return NULL;
9236 resdata = PyUnicode_DATA(res);
9237 if (lower)
9238 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009240 _Py_bytes_upper(resdata, data, len);
9241 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242}
9243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009245handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009247 Py_ssize_t j;
9248 int final_sigma;
9249 Py_UCS4 c;
9250 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009251
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009252 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9253
9254 where ! is a negation and \p{xxx} is a character with property xxx.
9255 */
9256 for (j = i - 1; j >= 0; j--) {
9257 c = PyUnicode_READ(kind, data, j);
9258 if (!_PyUnicode_IsCaseIgnorable(c))
9259 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009261 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9262 if (final_sigma) {
9263 for (j = i + 1; j < length; j++) {
9264 c = PyUnicode_READ(kind, data, j);
9265 if (!_PyUnicode_IsCaseIgnorable(c))
9266 break;
9267 }
9268 final_sigma = j == length || !_PyUnicode_IsCased(c);
9269 }
9270 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271}
9272
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009273static int
9274lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9275 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009277 /* Obscure special case. */
9278 if (c == 0x3A3) {
9279 mapped[0] = handle_capital_sigma(kind, data, length, i);
9280 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009282 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283}
9284
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009285static Py_ssize_t
9286do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009288 Py_ssize_t i, k = 0;
9289 int n_res, j;
9290 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009291
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009292 c = PyUnicode_READ(kind, data, 0);
9293 n_res = _PyUnicode_ToUpperFull(c, mapped);
9294 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009295 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009298 for (i = 1; i < length; i++) {
9299 c = PyUnicode_READ(kind, data, i);
9300 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9301 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009302 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009303 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009304 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009305 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009306 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307}
9308
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009309static Py_ssize_t
9310do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9311 Py_ssize_t i, k = 0;
9312
9313 for (i = 0; i < length; i++) {
9314 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9315 int n_res, j;
9316 if (Py_UNICODE_ISUPPER(c)) {
9317 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9318 }
9319 else if (Py_UNICODE_ISLOWER(c)) {
9320 n_res = _PyUnicode_ToUpperFull(c, mapped);
9321 }
9322 else {
9323 n_res = 1;
9324 mapped[0] = c;
9325 }
9326 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009327 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009328 res[k++] = mapped[j];
9329 }
9330 }
9331 return k;
9332}
9333
9334static Py_ssize_t
9335do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9336 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009338 Py_ssize_t i, k = 0;
9339
9340 for (i = 0; i < length; i++) {
9341 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9342 int n_res, j;
9343 if (lower)
9344 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9345 else
9346 n_res = _PyUnicode_ToUpperFull(c, mapped);
9347 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009348 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009349 res[k++] = mapped[j];
9350 }
9351 }
9352 return k;
9353}
9354
9355static Py_ssize_t
9356do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9357{
9358 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9359}
9360
9361static Py_ssize_t
9362do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9363{
9364 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9365}
9366
Benjamin Petersone51757f2012-01-12 21:10:29 -05009367static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009368do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9369{
9370 Py_ssize_t i, k = 0;
9371
9372 for (i = 0; i < length; i++) {
9373 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9374 Py_UCS4 mapped[3];
9375 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9376 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009377 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009378 res[k++] = mapped[j];
9379 }
9380 }
9381 return k;
9382}
9383
9384static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009385do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9386{
9387 Py_ssize_t i, k = 0;
9388 int previous_is_cased;
9389
9390 previous_is_cased = 0;
9391 for (i = 0; i < length; i++) {
9392 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9393 Py_UCS4 mapped[3];
9394 int n_res, j;
9395
9396 if (previous_is_cased)
9397 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9398 else
9399 n_res = _PyUnicode_ToTitleFull(c, mapped);
9400
9401 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009402 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009403 res[k++] = mapped[j];
9404 }
9405
9406 previous_is_cased = _PyUnicode_IsCased(c);
9407 }
9408 return k;
9409}
9410
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009411static PyObject *
9412case_operation(PyObject *self,
9413 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9414{
9415 PyObject *res = NULL;
9416 Py_ssize_t length, newlength = 0;
9417 int kind, outkind;
9418 void *data, *outdata;
9419 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9420
Benjamin Petersoneea48462012-01-16 14:28:50 -05009421 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009422
9423 kind = PyUnicode_KIND(self);
9424 data = PyUnicode_DATA(self);
9425 length = PyUnicode_GET_LENGTH(self);
9426 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9427 if (tmp == NULL)
9428 return PyErr_NoMemory();
9429 newlength = perform(kind, data, length, tmp, &maxchar);
9430 res = PyUnicode_New(newlength, maxchar);
9431 if (res == NULL)
9432 goto leave;
9433 tmpend = tmp + newlength;
9434 outdata = PyUnicode_DATA(res);
9435 outkind = PyUnicode_KIND(res);
9436 switch (outkind) {
9437 case PyUnicode_1BYTE_KIND:
9438 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9439 break;
9440 case PyUnicode_2BYTE_KIND:
9441 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9442 break;
9443 case PyUnicode_4BYTE_KIND:
9444 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9445 break;
9446 default:
9447 assert(0);
9448 break;
9449 }
9450 leave:
9451 PyMem_FREE(tmp);
9452 return res;
9453}
9454
Tim Peters8ce9f162004-08-27 01:49:32 +00009455PyObject *
9456PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009459 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009461 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009462 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9463 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009464 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009466 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009468 int use_memcpy;
9469 unsigned char *res_data = NULL, *sep_data = NULL;
9470 PyObject *last_obj;
9471 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472
Tim Peters05eba1f2004-08-27 21:32:02 +00009473 fseq = PySequence_Fast(seq, "");
9474 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009475 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009476 }
9477
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009478 /* NOTE: the following code can't call back into Python code,
9479 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009480 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009481
Tim Peters05eba1f2004-08-27 21:32:02 +00009482 seqlen = PySequence_Fast_GET_SIZE(fseq);
9483 /* If empty sequence, return u"". */
9484 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009485 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009486 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009487 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009488
Tim Peters05eba1f2004-08-27 21:32:02 +00009489 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009490 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009491 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009492 if (seqlen == 1) {
9493 if (PyUnicode_CheckExact(items[0])) {
9494 res = items[0];
9495 Py_INCREF(res);
9496 Py_DECREF(fseq);
9497 return res;
9498 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009499 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009500 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009501 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009502 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009503 /* Set up sep and seplen */
9504 if (separator == NULL) {
9505 /* fall back to a blank space separator */
9506 sep = PyUnicode_FromOrdinal(' ');
9507 if (!sep)
9508 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009509 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009510 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009511 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009512 else {
9513 if (!PyUnicode_Check(separator)) {
9514 PyErr_Format(PyExc_TypeError,
9515 "separator: expected str instance,"
9516 " %.80s found",
9517 Py_TYPE(separator)->tp_name);
9518 goto onError;
9519 }
9520 if (PyUnicode_READY(separator))
9521 goto onError;
9522 sep = separator;
9523 seplen = PyUnicode_GET_LENGTH(separator);
9524 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9525 /* inc refcount to keep this code path symmetric with the
9526 above case of a blank separator */
9527 Py_INCREF(sep);
9528 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009529 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009530 }
9531
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009532 /* There are at least two things to join, or else we have a subclass
9533 * of str in the sequence.
9534 * Do a pre-pass to figure out the total amount of space we'll
9535 * need (sz), and see whether all argument are strings.
9536 */
9537 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009538#ifdef Py_DEBUG
9539 use_memcpy = 0;
9540#else
9541 use_memcpy = 1;
9542#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009543 for (i = 0; i < seqlen; i++) {
9544 const Py_ssize_t old_sz = sz;
9545 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 if (!PyUnicode_Check(item)) {
9547 PyErr_Format(PyExc_TypeError,
9548 "sequence item %zd: expected str instance,"
9549 " %.80s found",
9550 i, Py_TYPE(item)->tp_name);
9551 goto onError;
9552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 if (PyUnicode_READY(item) == -1)
9554 goto onError;
9555 sz += PyUnicode_GET_LENGTH(item);
9556 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009557 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009558 if (i != 0)
9559 sz += seplen;
9560 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9561 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009563 goto onError;
9564 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009565 if (use_memcpy && last_obj != NULL) {
9566 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9567 use_memcpy = 0;
9568 }
9569 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009570 }
Tim Petersced69f82003-09-16 20:30:58 +00009571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009573 if (res == NULL)
9574 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009575
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009576 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009577#ifdef Py_DEBUG
9578 use_memcpy = 0;
9579#else
9580 if (use_memcpy) {
9581 res_data = PyUnicode_1BYTE_DATA(res);
9582 kind = PyUnicode_KIND(res);
9583 if (seplen != 0)
9584 sep_data = PyUnicode_1BYTE_DATA(sep);
9585 }
9586#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009587 if (use_memcpy) {
9588 for (i = 0; i < seqlen; ++i) {
9589 Py_ssize_t itemlen;
9590 item = items[i];
9591
9592 /* Copy item, and maybe the separator. */
9593 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009594 Py_MEMCPY(res_data,
9595 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009596 kind * seplen);
9597 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009598 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009599
9600 itemlen = PyUnicode_GET_LENGTH(item);
9601 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009602 Py_MEMCPY(res_data,
9603 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009604 kind * itemlen);
9605 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009606 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009607 }
9608 assert(res_data == PyUnicode_1BYTE_DATA(res)
9609 + kind * PyUnicode_GET_LENGTH(res));
9610 }
9611 else {
9612 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9613 Py_ssize_t itemlen;
9614 item = items[i];
9615
9616 /* Copy item, and maybe the separator. */
9617 if (i && seplen != 0) {
9618 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9619 res_offset += seplen;
9620 }
9621
9622 itemlen = PyUnicode_GET_LENGTH(item);
9623 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009624 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009625 res_offset += itemlen;
9626 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009627 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009628 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009629 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009630
Tim Peters05eba1f2004-08-27 21:32:02 +00009631 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009633 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009637 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009639 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 return NULL;
9641}
9642
/* Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code-unit index `start`.  `kind` selects the code-unit
   type: unsigned char, Py_UCS2 or Py_UCS4.

   No bounds or range checking is done.  `value` and `length` may be
   evaluated more than once, so pass expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9666
Victor Stinnerd3f08822012-05-29 12:57:52 +02009667void
9668_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9669 Py_UCS4 fill_char)
9670{
9671 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9672 const void *data = PyUnicode_DATA(unicode);
9673 assert(PyUnicode_IS_READY(unicode));
9674 assert(unicode_modifiable(unicode));
9675 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9676 assert(start >= 0);
9677 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9678 FILL(kind, data, fill_char, start, length);
9679}
9680
Victor Stinner3fe55312012-01-04 00:33:50 +01009681Py_ssize_t
9682PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9683 Py_UCS4 fill_char)
9684{
9685 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009686
9687 if (!PyUnicode_Check(unicode)) {
9688 PyErr_BadInternalCall();
9689 return -1;
9690 }
9691 if (PyUnicode_READY(unicode) == -1)
9692 return -1;
9693 if (unicode_check_modifiable(unicode))
9694 return -1;
9695
Victor Stinnerd3f08822012-05-29 12:57:52 +02009696 if (start < 0) {
9697 PyErr_SetString(PyExc_IndexError, "string index out of range");
9698 return -1;
9699 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009700 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9701 PyErr_SetString(PyExc_ValueError,
9702 "fill character is bigger than "
9703 "the string maximum character");
9704 return -1;
9705 }
9706
9707 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9708 length = Py_MIN(maxlen, length);
9709 if (length <= 0)
9710 return 0;
9711
Victor Stinnerd3f08822012-05-29 12:57:52 +02009712 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009713 return length;
9714}
9715
Victor Stinner9310abb2011-10-05 00:59:23 +02009716static PyObject *
9717pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009718 Py_ssize_t left,
9719 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 PyObject *u;
9723 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009724 int kind;
9725 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726
9727 if (left < 0)
9728 left = 0;
9729 if (right < 0)
9730 right = 0;
9731
Victor Stinnerc4b49542011-12-11 22:44:26 +01009732 if (left == 0 && right == 0)
9733 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9736 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009737 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9738 return NULL;
9739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009741 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009743 if (!u)
9744 return NULL;
9745
9746 kind = PyUnicode_KIND(u);
9747 data = PyUnicode_DATA(u);
9748 if (left)
9749 FILL(kind, data, fill, 0, left);
9750 if (right)
9751 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009752 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009753 assert(_PyUnicode_CheckConsistency(u, 1));
9754 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755}
9756
Alexander Belopolsky40018472011-02-26 01:02:56 +00009757PyObject *
9758PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761
9762 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009763 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009765 if (PyUnicode_READY(string) == -1) {
9766 Py_DECREF(string);
9767 return NULL;
9768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769
Benjamin Petersonead6b532011-12-20 17:23:42 -06009770 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 if (PyUnicode_IS_ASCII(string))
9773 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009774 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009775 PyUnicode_GET_LENGTH(string), keepends);
9776 else
9777 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009779 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 break;
9781 case PyUnicode_2BYTE_KIND:
9782 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009783 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 PyUnicode_GET_LENGTH(string), keepends);
9785 break;
9786 case PyUnicode_4BYTE_KIND:
9787 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 PyUnicode_GET_LENGTH(string), keepends);
9790 break;
9791 default:
9792 assert(0);
9793 list = 0;
9794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795 Py_DECREF(string);
9796 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797}
9798
Alexander Belopolsky40018472011-02-26 01:02:56 +00009799static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009800split(PyObject *self,
9801 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009802 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 int kind1, kind2, kind;
9805 void *buf1, *buf2;
9806 Py_ssize_t len1, len2;
9807 PyObject* out;
9808
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009810 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 if (PyUnicode_READY(self) == -1)
9813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009816 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009818 if (PyUnicode_IS_ASCII(self))
9819 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009821 PyUnicode_GET_LENGTH(self), maxcount
9822 );
9823 else
9824 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009825 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009826 PyUnicode_GET_LENGTH(self), maxcount
9827 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 case PyUnicode_2BYTE_KIND:
9829 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 PyUnicode_GET_LENGTH(self), maxcount
9832 );
9833 case PyUnicode_4BYTE_KIND:
9834 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009835 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 PyUnicode_GET_LENGTH(self), maxcount
9837 );
9838 default:
9839 assert(0);
9840 return NULL;
9841 }
9842
9843 if (PyUnicode_READY(substring) == -1)
9844 return NULL;
9845
9846 kind1 = PyUnicode_KIND(self);
9847 kind2 = PyUnicode_KIND(substring);
9848 kind = kind1 > kind2 ? kind1 : kind2;
9849 buf1 = PyUnicode_DATA(self);
9850 buf2 = PyUnicode_DATA(substring);
9851 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009852 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 if (!buf1)
9854 return NULL;
9855 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009856 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (!buf2) {
9858 if (kind1 != kind) PyMem_Free(buf1);
9859 return NULL;
9860 }
9861 len1 = PyUnicode_GET_LENGTH(self);
9862 len2 = PyUnicode_GET_LENGTH(substring);
9863
Benjamin Petersonead6b532011-12-20 17:23:42 -06009864 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009866 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9867 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009869 else
9870 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009871 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 break;
9873 case PyUnicode_2BYTE_KIND:
9874 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 break;
9877 case PyUnicode_4BYTE_KIND:
9878 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009879 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 break;
9881 default:
9882 out = NULL;
9883 }
9884 if (kind1 != kind)
9885 PyMem_Free(buf1);
9886 if (kind2 != kind)
9887 PyMem_Free(buf2);
9888 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889}
9890
Alexander Belopolsky40018472011-02-26 01:02:56 +00009891static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009892rsplit(PyObject *self,
9893 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009894 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 int kind1, kind2, kind;
9897 void *buf1, *buf2;
9898 Py_ssize_t len1, len2;
9899 PyObject* out;
9900
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009901 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009902 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 if (PyUnicode_READY(self) == -1)
9905 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009908 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009910 if (PyUnicode_IS_ASCII(self))
9911 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009913 PyUnicode_GET_LENGTH(self), maxcount
9914 );
9915 else
9916 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009917 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009918 PyUnicode_GET_LENGTH(self), maxcount
9919 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 case PyUnicode_2BYTE_KIND:
9921 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 PyUnicode_GET_LENGTH(self), maxcount
9924 );
9925 case PyUnicode_4BYTE_KIND:
9926 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 PyUnicode_GET_LENGTH(self), maxcount
9929 );
9930 default:
9931 assert(0);
9932 return NULL;
9933 }
9934
9935 if (PyUnicode_READY(substring) == -1)
9936 return NULL;
9937
9938 kind1 = PyUnicode_KIND(self);
9939 kind2 = PyUnicode_KIND(substring);
9940 kind = kind1 > kind2 ? kind1 : kind2;
9941 buf1 = PyUnicode_DATA(self);
9942 buf2 = PyUnicode_DATA(substring);
9943 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009944 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (!buf1)
9946 return NULL;
9947 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009948 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 if (!buf2) {
9950 if (kind1 != kind) PyMem_Free(buf1);
9951 return NULL;
9952 }
9953 len1 = PyUnicode_GET_LENGTH(self);
9954 len2 = PyUnicode_GET_LENGTH(substring);
9955
Benjamin Petersonead6b532011-12-20 17:23:42 -06009956 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009958 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9959 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009960 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009961 else
9962 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 break;
9965 case PyUnicode_2BYTE_KIND:
9966 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009967 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 break;
9969 case PyUnicode_4BYTE_KIND:
9970 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009971 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 break;
9973 default:
9974 out = NULL;
9975 }
9976 if (kind1 != kind)
9977 PyMem_Free(buf1);
9978 if (kind2 != kind)
9979 PyMem_Free(buf2);
9980 return out;
9981}
9982
9983static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009984anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9985 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009987 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009989 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9990 return asciilib_find(buf1, len1, buf2, len2, offset);
9991 else
9992 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 case PyUnicode_2BYTE_KIND:
9994 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9995 case PyUnicode_4BYTE_KIND:
9996 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9997 }
9998 assert(0);
9999 return -1;
10000}
10001
10002static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010003anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10004 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010006 switch (kind) {
10007 case PyUnicode_1BYTE_KIND:
10008 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10009 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10010 else
10011 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10012 case PyUnicode_2BYTE_KIND:
10013 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10014 case PyUnicode_4BYTE_KIND:
10015 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10016 }
10017 assert(0);
10018 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010019}
10020
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010021static void
10022replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10023 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10024{
10025 int kind = PyUnicode_KIND(u);
10026 void *data = PyUnicode_DATA(u);
10027 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10028 if (kind == PyUnicode_1BYTE_KIND) {
10029 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10030 (Py_UCS1 *)data + len,
10031 u1, u2, maxcount);
10032 }
10033 else if (kind == PyUnicode_2BYTE_KIND) {
10034 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10035 (Py_UCS2 *)data + len,
10036 u1, u2, maxcount);
10037 }
10038 else {
10039 assert(kind == PyUnicode_4BYTE_KIND);
10040 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10041 (Py_UCS4 *)data + len,
10042 u1, u2, maxcount);
10043 }
10044}
10045
Alexander Belopolsky40018472011-02-26 01:02:56 +000010046static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047replace(PyObject *self, PyObject *str1,
10048 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010049{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 PyObject *u;
10051 char *sbuf = PyUnicode_DATA(self);
10052 char *buf1 = PyUnicode_DATA(str1);
10053 char *buf2 = PyUnicode_DATA(str2);
10054 int srelease = 0, release1 = 0, release2 = 0;
10055 int skind = PyUnicode_KIND(self);
10056 int kind1 = PyUnicode_KIND(str1);
10057 int kind2 = PyUnicode_KIND(str2);
10058 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10059 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10060 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010061 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010062 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063
10064 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010065 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010067 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068
Victor Stinner59de0ee2011-10-07 10:01:28 +020010069 if (str1 == str2)
10070 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071
Victor Stinner49a0a212011-10-12 23:46:10 +020010072 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010073 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10074 if (maxchar < maxchar_str1)
10075 /* substring too wide to be present */
10076 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010077 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10078 /* Replacing str1 with str2 may cause a maxchar reduction in the
10079 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010080 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010081 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010084 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010086 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010088 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010089 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010090 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010091
Victor Stinner69ed0f42013-04-09 21:48:24 +020010092 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010093 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010094 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010095 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010096 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010098 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010100
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010101 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10102 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010103 }
10104 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 int rkind = skind;
10106 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010107 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (kind1 < rkind) {
10110 /* widen substring */
10111 buf1 = _PyUnicode_AsKind(str1, rkind);
10112 if (!buf1) goto error;
10113 release1 = 1;
10114 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010115 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010116 if (i < 0)
10117 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 if (rkind > kind2) {
10119 /* widen replacement */
10120 buf2 = _PyUnicode_AsKind(str2, rkind);
10121 if (!buf2) goto error;
10122 release2 = 1;
10123 }
10124 else if (rkind < kind2) {
10125 /* widen self and buf1 */
10126 rkind = kind2;
10127 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010128 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 sbuf = _PyUnicode_AsKind(self, rkind);
10130 if (!sbuf) goto error;
10131 srelease = 1;
10132 buf1 = _PyUnicode_AsKind(str1, rkind);
10133 if (!buf1) goto error;
10134 release1 = 1;
10135 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010136 u = PyUnicode_New(slen, maxchar);
10137 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010139 assert(PyUnicode_KIND(u) == rkind);
10140 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010141
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010142 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010143 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010144 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010146 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010148
10149 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010151 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010152 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010153 if (i == -1)
10154 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010155 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010157 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010161 }
10162 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010164 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 int rkind = skind;
10166 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010169 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 buf1 = _PyUnicode_AsKind(str1, rkind);
10171 if (!buf1) goto error;
10172 release1 = 1;
10173 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010174 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010175 if (n == 0)
10176 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010178 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 buf2 = _PyUnicode_AsKind(str2, rkind);
10180 if (!buf2) goto error;
10181 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010184 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 rkind = kind2;
10186 sbuf = _PyUnicode_AsKind(self, rkind);
10187 if (!sbuf) goto error;
10188 srelease = 1;
10189 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010190 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 buf1 = _PyUnicode_AsKind(str1, rkind);
10192 if (!buf1) goto error;
10193 release1 = 1;
10194 }
10195 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10196 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010197 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 PyErr_SetString(PyExc_OverflowError,
10199 "replace string is too long");
10200 goto error;
10201 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010202 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010203 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010204 _Py_INCREF_UNICODE_EMPTY();
10205 if (!unicode_empty)
10206 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010207 u = unicode_empty;
10208 goto done;
10209 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010210 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 PyErr_SetString(PyExc_OverflowError,
10212 "replace string is too long");
10213 goto error;
10214 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010215 u = PyUnicode_New(new_size, maxchar);
10216 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010218 assert(PyUnicode_KIND(u) == rkind);
10219 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 ires = i = 0;
10221 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222 while (n-- > 0) {
10223 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010225 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010226 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010227 if (j == -1)
10228 break;
10229 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010230 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010231 memcpy(res + rkind * ires,
10232 sbuf + rkind * i,
10233 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010235 }
10236 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010238 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010240 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010246 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010247 memcpy(res + rkind * ires,
10248 sbuf + rkind * i,
10249 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010250 }
10251 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010252 /* interleave */
10253 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010254 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010256 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010258 if (--n <= 0)
10259 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010260 memcpy(res + rkind * ires,
10261 sbuf + rkind * i,
10262 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 ires++;
10264 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010265 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010266 memcpy(res + rkind * ires,
10267 sbuf + rkind * i,
10268 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010269 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010270 }
10271
10272 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010273 unicode_adjust_maxchar(&u);
10274 if (u == NULL)
10275 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010277
10278 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 if (srelease)
10280 PyMem_FREE(sbuf);
10281 if (release1)
10282 PyMem_FREE(buf1);
10283 if (release2)
10284 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010285 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010287
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010289 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 if (srelease)
10291 PyMem_FREE(sbuf);
10292 if (release1)
10293 PyMem_FREE(buf1);
10294 if (release2)
10295 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010296 return unicode_result_unchanged(self);
10297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 error:
10299 if (srelease && sbuf)
10300 PyMem_FREE(sbuf);
10301 if (release1 && buf1)
10302 PyMem_FREE(buf1);
10303 if (release2 && buf2)
10304 PyMem_FREE(buf2);
10305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306}
10307
10308/* --- Unicode Object Methods --------------------------------------------- */
10309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010310PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010311 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312\n\
10313Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010314characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315
10316static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010317unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010319 if (PyUnicode_READY(self) == -1)
10320 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010321 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322}
10323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010324PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010325 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326\n\
10327Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010328have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329
10330static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010331unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010333 if (PyUnicode_READY(self) == -1)
10334 return NULL;
10335 if (PyUnicode_GET_LENGTH(self) == 0)
10336 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010337 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338}
10339
Benjamin Petersond5890c82012-01-14 13:23:30 -050010340PyDoc_STRVAR(casefold__doc__,
10341 "S.casefold() -> str\n\
10342\n\
10343Return a version of S suitable for caseless comparisons.");
10344
10345static PyObject *
10346unicode_casefold(PyObject *self)
10347{
10348 if (PyUnicode_READY(self) == -1)
10349 return NULL;
10350 if (PyUnicode_IS_ASCII(self))
10351 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010352 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010353}
10354
10355
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010356/* Argument converter. Coerces to a single unicode character */
10357
10358static int
10359convert_uc(PyObject *obj, void *addr)
10360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010362 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010363
Benjamin Peterson14339b62009-01-31 16:36:08 +000010364 uniobj = PyUnicode_FromObject(obj);
10365 if (uniobj == NULL) {
10366 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010367 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010368 return 0;
10369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010371 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010372 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010373 Py_DECREF(uniobj);
10374 return 0;
10375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010377 Py_DECREF(uniobj);
10378 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010379}
10380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010381PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010382 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010384Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010385done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
10387static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010388unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010390 Py_ssize_t marg, left;
10391 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 Py_UCS4 fillchar = ' ';
10393
Victor Stinnere9a29352011-10-01 02:14:59 +020010394 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396
Benjamin Petersonbac79492012-01-14 13:34:47 -050010397 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398 return NULL;
10399
Victor Stinnerc4b49542011-12-11 22:44:26 +010010400 if (PyUnicode_GET_LENGTH(self) >= width)
10401 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402
Victor Stinnerc4b49542011-12-11 22:44:26 +010010403 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404 left = marg / 2 + (marg & width & 1);
10405
Victor Stinner9310abb2011-10-05 00:59:23 +020010406 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407}
10408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409/* This function assumes that str1 and str2 are readied by the caller. */
10410
Marc-André Lemburge5034372000-08-08 08:04:29 +000010411static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010412unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010413{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010414#define COMPARE(TYPE1, TYPE2) \
10415 do { \
10416 TYPE1* p1 = (TYPE1 *)data1; \
10417 TYPE2* p2 = (TYPE2 *)data2; \
10418 TYPE1* end = p1 + len; \
10419 Py_UCS4 c1, c2; \
10420 for (; p1 != end; p1++, p2++) { \
10421 c1 = *p1; \
10422 c2 = *p2; \
10423 if (c1 != c2) \
10424 return (c1 < c2) ? -1 : 1; \
10425 } \
10426 } \
10427 while (0)
10428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 int kind1, kind2;
10430 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010431 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 kind1 = PyUnicode_KIND(str1);
10434 kind2 = PyUnicode_KIND(str2);
10435 data1 = PyUnicode_DATA(str1);
10436 data2 = PyUnicode_DATA(str2);
10437 len1 = PyUnicode_GET_LENGTH(str1);
10438 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010439 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010440
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010441 switch(kind1) {
10442 case PyUnicode_1BYTE_KIND:
10443 {
10444 switch(kind2) {
10445 case PyUnicode_1BYTE_KIND:
10446 {
10447 int cmp = memcmp(data1, data2, len);
10448 /* normalize result of memcmp() into the range [-1; 1] */
10449 if (cmp < 0)
10450 return -1;
10451 if (cmp > 0)
10452 return 1;
10453 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010454 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010455 case PyUnicode_2BYTE_KIND:
10456 COMPARE(Py_UCS1, Py_UCS2);
10457 break;
10458 case PyUnicode_4BYTE_KIND:
10459 COMPARE(Py_UCS1, Py_UCS4);
10460 break;
10461 default:
10462 assert(0);
10463 }
10464 break;
10465 }
10466 case PyUnicode_2BYTE_KIND:
10467 {
10468 switch(kind2) {
10469 case PyUnicode_1BYTE_KIND:
10470 COMPARE(Py_UCS2, Py_UCS1);
10471 break;
10472 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010473 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010474 COMPARE(Py_UCS2, Py_UCS2);
10475 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010476 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010477 case PyUnicode_4BYTE_KIND:
10478 COMPARE(Py_UCS2, Py_UCS4);
10479 break;
10480 default:
10481 assert(0);
10482 }
10483 break;
10484 }
10485 case PyUnicode_4BYTE_KIND:
10486 {
10487 switch(kind2) {
10488 case PyUnicode_1BYTE_KIND:
10489 COMPARE(Py_UCS4, Py_UCS1);
10490 break;
10491 case PyUnicode_2BYTE_KIND:
10492 COMPARE(Py_UCS4, Py_UCS2);
10493 break;
10494 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010495 {
10496#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10497 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10498 /* normalize result of wmemcmp() into the range [-1; 1] */
10499 if (cmp < 0)
10500 return -1;
10501 if (cmp > 0)
10502 return 1;
10503#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010504 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010505#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010506 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010507 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010508 default:
10509 assert(0);
10510 }
10511 break;
10512 }
10513 default:
10514 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010515 }
10516
Victor Stinner770e19e2012-10-04 22:59:45 +020010517 if (len1 == len2)
10518 return 0;
10519 if (len1 < len2)
10520 return -1;
10521 else
10522 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010523
10524#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010525}
10526
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010527Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010528unicode_compare_eq(PyObject *str1, PyObject *str2)
10529{
10530 int kind;
10531 void *data1, *data2;
10532 Py_ssize_t len;
10533 int cmp;
10534
Victor Stinnere5567ad2012-10-23 02:48:49 +020010535 len = PyUnicode_GET_LENGTH(str1);
10536 if (PyUnicode_GET_LENGTH(str2) != len)
10537 return 0;
10538 kind = PyUnicode_KIND(str1);
10539 if (PyUnicode_KIND(str2) != kind)
10540 return 0;
10541 data1 = PyUnicode_DATA(str1);
10542 data2 = PyUnicode_DATA(str2);
10543
10544 cmp = memcmp(data1, data2, len * kind);
10545 return (cmp == 0);
10546}
10547
10548
Alexander Belopolsky40018472011-02-26 01:02:56 +000010549int
10550PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10553 if (PyUnicode_READY(left) == -1 ||
10554 PyUnicode_READY(right) == -1)
10555 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010556
10557 /* a string is equal to itself */
10558 if (left == right)
10559 return 0;
10560
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010561 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010563 PyErr_Format(PyExc_TypeError,
10564 "Can't compare %.100s and %.100s",
10565 left->ob_type->tp_name,
10566 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567 return -1;
10568}
10569
Martin v. Löwis5b222132007-06-10 09:51:05 +000010570int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010571_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10572{
10573 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10574 if (right_str == NULL)
10575 return -1;
10576 return PyUnicode_Compare(left, right_str);
10577}
10578
10579int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010580PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 Py_ssize_t i;
10583 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 Py_UCS4 chr;
10585
Victor Stinner910337b2011-10-03 03:20:16 +020010586 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (PyUnicode_READY(uni) == -1)
10588 return -1;
10589 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010590 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010591 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010592 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010593 size_t len, len2 = strlen(str);
10594 int cmp;
10595
10596 len = Py_MIN(len1, len2);
10597 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010598 if (cmp != 0) {
10599 if (cmp < 0)
10600 return -1;
10601 else
10602 return 1;
10603 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010604 if (len1 > len2)
10605 return 1; /* uni is longer */
10606 if (len2 > len1)
10607 return -1; /* str is longer */
10608 return 0;
10609 }
10610 else {
10611 void *data = PyUnicode_DATA(uni);
10612 /* Compare Unicode string and source character set string */
10613 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10614 if (chr != str[i])
10615 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10616 /* This check keeps Python strings that end in '\0' from comparing equal
10617 to C strings identical up to that point. */
10618 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10619 return 1; /* uni is longer */
10620 if (str[i])
10621 return -1; /* str is longer */
10622 return 0;
10623 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010624}
10625
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010626
Benjamin Peterson29060642009-01-31 22:14:21 +000010627#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010628 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010629
Alexander Belopolsky40018472011-02-26 01:02:56 +000010630PyObject *
10631PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010632{
10633 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010634 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010635
Victor Stinnere5567ad2012-10-23 02:48:49 +020010636 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10637 Py_RETURN_NOTIMPLEMENTED;
10638
10639 if (PyUnicode_READY(left) == -1 ||
10640 PyUnicode_READY(right) == -1)
10641 return NULL;
10642
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010643 if (left == right) {
10644 switch (op) {
10645 case Py_EQ:
10646 case Py_LE:
10647 case Py_GE:
10648 /* a string is equal to itself */
10649 v = Py_True;
10650 break;
10651 case Py_NE:
10652 case Py_LT:
10653 case Py_GT:
10654 v = Py_False;
10655 break;
10656 default:
10657 PyErr_BadArgument();
10658 return NULL;
10659 }
10660 }
10661 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010662 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010663 result ^= (op == Py_NE);
10664 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010665 }
10666 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010667 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010668
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010669 /* Convert the return value to a Boolean */
10670 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010671 case Py_LE:
10672 v = TEST_COND(result <= 0);
10673 break;
10674 case Py_GE:
10675 v = TEST_COND(result >= 0);
10676 break;
10677 case Py_LT:
10678 v = TEST_COND(result == -1);
10679 break;
10680 case Py_GT:
10681 v = TEST_COND(result == 1);
10682 break;
10683 default:
10684 PyErr_BadArgument();
10685 return NULL;
10686 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010687 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010688 Py_INCREF(v);
10689 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010690}
10691
Alexander Belopolsky40018472011-02-26 01:02:56 +000010692int
10693PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010694{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010696 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 void *buf1, *buf2;
10698 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010699 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010700
10701 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010702 sub = PyUnicode_FromObject(element);
10703 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010704 PyErr_Format(PyExc_TypeError,
10705 "'in <string>' requires string as left operand, not %s",
10706 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010707 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010708 }
10709
Thomas Wouters477c8d52006-05-27 19:21:47 +000010710 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010711 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010712 Py_DECREF(sub);
10713 return -1;
10714 }
10715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 kind1 = PyUnicode_KIND(str);
10717 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 buf1 = PyUnicode_DATA(str);
10719 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010720 if (kind2 != kind1) {
10721 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010722 Py_DECREF(sub);
10723 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010724 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010725 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010726 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (!buf2) {
10729 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010730 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 return -1;
10732 }
10733 len1 = PyUnicode_GET_LENGTH(str);
10734 len2 = PyUnicode_GET_LENGTH(sub);
10735
Victor Stinner77282cb2013-04-14 19:22:47 +020010736 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 case PyUnicode_1BYTE_KIND:
10738 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10739 break;
10740 case PyUnicode_2BYTE_KIND:
10741 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10742 break;
10743 case PyUnicode_4BYTE_KIND:
10744 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10745 break;
10746 default:
10747 result = -1;
10748 assert(0);
10749 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010750
10751 Py_DECREF(str);
10752 Py_DECREF(sub);
10753
Victor Stinner77282cb2013-04-14 19:22:47 +020010754 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 PyMem_Free(buf2);
10756
Guido van Rossum403d68b2000-03-13 15:55:09 +000010757 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010758}
10759
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760/* Concat to string or Unicode object giving a new Unicode object. */
10761
Alexander Belopolsky40018472011-02-26 01:02:56 +000010762PyObject *
10763PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010766 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010767 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768
10769 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010772 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010775 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776
10777 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010778 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010779 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010782 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 }
10786
Victor Stinner488fa492011-12-12 00:01:39 +010010787 u_len = PyUnicode_GET_LENGTH(u);
10788 v_len = PyUnicode_GET_LENGTH(v);
10789 if (u_len > PY_SSIZE_T_MAX - v_len) {
10790 PyErr_SetString(PyExc_OverflowError,
10791 "strings are too large to concat");
10792 goto onError;
10793 }
10794 new_len = u_len + v_len;
10795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010797 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010798 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010801 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010803 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010804 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10805 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806 Py_DECREF(u);
10807 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010808 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810
Benjamin Peterson29060642009-01-31 22:14:21 +000010811 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812 Py_XDECREF(u);
10813 Py_XDECREF(v);
10814 return NULL;
10815}
10816
Walter Dörwald1ab83302007-05-18 17:15:44 +000010817void
Victor Stinner23e56682011-10-03 03:54:37 +020010818PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010819{
Victor Stinner23e56682011-10-03 03:54:37 +020010820 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010821 Py_UCS4 maxchar, maxchar2;
10822 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010823
10824 if (p_left == NULL) {
10825 if (!PyErr_Occurred())
10826 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010827 return;
10828 }
Victor Stinner23e56682011-10-03 03:54:37 +020010829 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020010830 if (right == NULL || left == NULL
10831 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010832 if (!PyErr_Occurred())
10833 PyErr_BadInternalCall();
10834 goto error;
10835 }
10836
Benjamin Petersonbac79492012-01-14 13:34:47 -050010837 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010838 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010839 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010840 goto error;
10841
Victor Stinner488fa492011-12-12 00:01:39 +010010842 /* Shortcuts */
10843 if (left == unicode_empty) {
10844 Py_DECREF(left);
10845 Py_INCREF(right);
10846 *p_left = right;
10847 return;
10848 }
10849 if (right == unicode_empty)
10850 return;
10851
10852 left_len = PyUnicode_GET_LENGTH(left);
10853 right_len = PyUnicode_GET_LENGTH(right);
10854 if (left_len > PY_SSIZE_T_MAX - right_len) {
10855 PyErr_SetString(PyExc_OverflowError,
10856 "strings are too large to concat");
10857 goto error;
10858 }
10859 new_len = left_len + right_len;
10860
10861 if (unicode_modifiable(left)
10862 && PyUnicode_CheckExact(right)
10863 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010864 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10865 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010866 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010867 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010868 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10869 {
10870 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010871 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010010872 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020010873
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010874 /* copy 'right' into the newly allocated area of 'left' */
10875 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010876 }
Victor Stinner488fa492011-12-12 00:01:39 +010010877 else {
10878 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10879 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010880 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010881
Victor Stinner488fa492011-12-12 00:01:39 +010010882 /* Concat the two Unicode strings */
10883 res = PyUnicode_New(new_len, maxchar);
10884 if (res == NULL)
10885 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010886 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10887 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010888 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010889 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010010890 }
10891 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010892 return;
10893
10894error:
Victor Stinner488fa492011-12-12 00:01:39 +010010895 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010896}
10897
10898void
10899PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10900{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010901 PyUnicode_Append(pleft, right);
10902 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010903}
10904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010905PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010908Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010909string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010910interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911
10912static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010913unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010915 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010916 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010917 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 int kind1, kind2, kind;
10920 void *buf1, *buf2;
10921 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922
Jesus Ceaac451502011-04-20 17:09:23 +020010923 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10924 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010925 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 kind1 = PyUnicode_KIND(self);
10928 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020010929 if (kind2 > kind1) {
10930 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010931 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020010932 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010933 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 buf1 = PyUnicode_DATA(self);
10935 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010937 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 if (!buf2) {
10939 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 return NULL;
10941 }
10942 len1 = PyUnicode_GET_LENGTH(self);
10943 len2 = PyUnicode_GET_LENGTH(substring);
10944
10945 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010946 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 case PyUnicode_1BYTE_KIND:
10948 iresult = ucs1lib_count(
10949 ((Py_UCS1*)buf1) + start, end - start,
10950 buf2, len2, PY_SSIZE_T_MAX
10951 );
10952 break;
10953 case PyUnicode_2BYTE_KIND:
10954 iresult = ucs2lib_count(
10955 ((Py_UCS2*)buf1) + start, end - start,
10956 buf2, len2, PY_SSIZE_T_MAX
10957 );
10958 break;
10959 case PyUnicode_4BYTE_KIND:
10960 iresult = ucs4lib_count(
10961 ((Py_UCS4*)buf1) + start, end - start,
10962 buf2, len2, PY_SSIZE_T_MAX
10963 );
10964 break;
10965 default:
10966 assert(0); iresult = 0;
10967 }
10968
10969 result = PyLong_FromSsize_t(iresult);
10970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 if (kind2 != kind)
10972 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
10974 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010975
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976 return result;
10977}
10978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010979PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010980 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010982Encode S using the codec registered for encoding. Default encoding\n\
10983is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010984handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010985a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10986'xmlcharrefreplace' as well as any other name registered with\n\
10987codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988
10989static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010990unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010992 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993 char *encoding = NULL;
10994 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010995
Benjamin Peterson308d6372009-09-18 21:42:35 +000010996 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10997 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010999 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011000}
11001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011002PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004\n\
11005Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011006If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
11008static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011009unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011011 Py_ssize_t i, j, line_pos, src_len, incr;
11012 Py_UCS4 ch;
11013 PyObject *u;
11014 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011016 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011017 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018
11019 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021
Antoine Pitrou22425222011-10-04 19:10:51 +020011022 if (PyUnicode_READY(self) == -1)
11023 return NULL;
11024
Thomas Wouters7e474022000-07-16 12:04:32 +000011025 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011026 src_len = PyUnicode_GET_LENGTH(self);
11027 i = j = line_pos = 0;
11028 kind = PyUnicode_KIND(self);
11029 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011030 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011031 for (; i < src_len; i++) {
11032 ch = PyUnicode_READ(kind, src_data, i);
11033 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011034 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011036 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011037 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011038 goto overflow;
11039 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011041 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011044 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011045 goto overflow;
11046 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011048 if (ch == '\n' || ch == '\r')
11049 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011051 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011052 if (!found)
11053 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011054
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011056 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 if (!u)
11058 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011059 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060
Antoine Pitroue71d5742011-10-04 15:55:09 +020011061 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062
Antoine Pitroue71d5742011-10-04 15:55:09 +020011063 for (; i < src_len; i++) {
11064 ch = PyUnicode_READ(kind, src_data, i);
11065 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011066 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011067 incr = tabsize - (line_pos % tabsize);
11068 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011069 FILL(kind, dest_data, ' ', j, incr);
11070 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011071 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011072 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011073 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011074 line_pos++;
11075 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011076 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011077 if (ch == '\n' || ch == '\r')
11078 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011080 }
11081 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011082 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011083
Antoine Pitroue71d5742011-10-04 15:55:09 +020011084 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011085 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11086 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087}
11088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011089PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011090 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091\n\
11092Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011093such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094arguments start and end are interpreted as in slice notation.\n\
11095\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011096Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097
11098static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011101 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011102 Py_ssize_t start;
11103 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011104 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105
Jesus Ceaac451502011-04-20 17:09:23 +020011106 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11107 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
Christian Heimesd47802e2013-06-29 21:33:36 +020011110 if (PyUnicode_READY(self) == -1) {
11111 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011113 }
11114 if (PyUnicode_READY(substring) == -1) {
11115 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118
Victor Stinner7931d9a2011-11-04 00:22:48 +010011119 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120
11121 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 if (result == -2)
11124 return NULL;
11125
Christian Heimes217cfd12007-12-02 14:31:20 +000011126 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127}
11128
11129static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011130unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011132 void *data;
11133 enum PyUnicode_Kind kind;
11134 Py_UCS4 ch;
11135 PyObject *res;
11136
11137 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11138 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011140 }
11141 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11142 PyErr_SetString(PyExc_IndexError, "string index out of range");
11143 return NULL;
11144 }
11145 kind = PyUnicode_KIND(self);
11146 data = PyUnicode_DATA(self);
11147 ch = PyUnicode_READ(kind, data, index);
11148 if (ch < 256)
11149 return get_latin1_char(ch);
11150
11151 res = PyUnicode_New(1, ch);
11152 if (res == NULL)
11153 return NULL;
11154 kind = PyUnicode_KIND(res);
11155 data = PyUnicode_DATA(res);
11156 PyUnicode_WRITE(kind, data, 0, ch);
11157 assert(_PyUnicode_CheckConsistency(res, 1));
11158 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159}
11160
Guido van Rossumc2504932007-09-18 19:42:40 +000011161/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011162 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011163static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011164unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165{
Guido van Rossumc2504932007-09-18 19:42:40 +000011166 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011167 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011168
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011169#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011170 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011171#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (_PyUnicode_HASH(self) != -1)
11173 return _PyUnicode_HASH(self);
11174 if (PyUnicode_READY(self) == -1)
11175 return -1;
11176 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011177 /*
11178 We make the hash of the empty string be 0, rather than using
11179 (prefix ^ suffix), since this slightly obfuscates the hash secret
11180 */
11181 if (len == 0) {
11182 _PyUnicode_HASH(self) = 0;
11183 return 0;
11184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185
11186 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011187#define HASH(P) \
11188 x ^= (Py_uhash_t) *P << 7; \
11189 while (--len >= 0) \
11190 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191
Georg Brandl2fb477c2012-02-21 00:33:36 +010011192 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 switch (PyUnicode_KIND(self)) {
11194 case PyUnicode_1BYTE_KIND: {
11195 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11196 HASH(c);
11197 break;
11198 }
11199 case PyUnicode_2BYTE_KIND: {
11200 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11201 HASH(s);
11202 break;
11203 }
11204 default: {
11205 Py_UCS4 *l;
11206 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11207 "Impossible switch case in unicode_hash");
11208 l = PyUnicode_4BYTE_DATA(self);
11209 HASH(l);
11210 break;
11211 }
11212 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011213 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11214 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215
Guido van Rossumc2504932007-09-18 19:42:40 +000011216 if (x == -1)
11217 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011219 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011223PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
11228static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011231 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011232 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011233 Py_ssize_t start;
11234 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
Jesus Ceaac451502011-04-20 17:09:23 +020011236 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11237 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239
Christian Heimesd47a0452013-06-29 21:21:37 +020011240 if (PyUnicode_READY(self) == -1) {
11241 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011243 }
11244 if (PyUnicode_READY(substring) == -1) {
11245 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248
Victor Stinner7931d9a2011-11-04 00:22:48 +010011249 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250
11251 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 if (result == -2)
11254 return NULL;
11255
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 if (result < 0) {
11257 PyErr_SetString(PyExc_ValueError, "substring not found");
11258 return NULL;
11259 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011260
Christian Heimes217cfd12007-12-02 14:31:20 +000011261 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262}
11263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011264PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011267Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269
11270static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011271unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 Py_ssize_t i, length;
11274 int kind;
11275 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 int cased;
11277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 if (PyUnicode_READY(self) == -1)
11279 return NULL;
11280 length = PyUnicode_GET_LENGTH(self);
11281 kind = PyUnicode_KIND(self);
11282 data = PyUnicode_DATA(self);
11283
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 if (length == 1)
11286 return PyBool_FromLong(
11287 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011289 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011292
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 for (i = 0; i < length; i++) {
11295 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011296
Benjamin Peterson29060642009-01-31 22:14:21 +000011297 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11298 return PyBool_FromLong(0);
11299 else if (!cased && Py_UNICODE_ISLOWER(ch))
11300 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011302 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303}
11304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011308Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011309at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310
11311static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011312unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 Py_ssize_t i, length;
11315 int kind;
11316 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 int cased;
11318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (PyUnicode_READY(self) == -1)
11320 return NULL;
11321 length = PyUnicode_GET_LENGTH(self);
11322 kind = PyUnicode_KIND(self);
11323 data = PyUnicode_DATA(self);
11324
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 if (length == 1)
11327 return PyBool_FromLong(
11328 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011330 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011333
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 for (i = 0; i < length; i++) {
11336 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011337
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11339 return PyBool_FromLong(0);
11340 else if (!cased && Py_UNICODE_ISUPPER(ch))
11341 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011343 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344}
11345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011346PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011349Return True if S is a titlecased string and there is at least one\n\
11350character in S, i.e. upper- and titlecase characters may only\n\
11351follow uncased characters and lowercase characters only cased ones.\n\
11352Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353
11354static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011355unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 Py_ssize_t i, length;
11358 int kind;
11359 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360 int cased, previous_is_cased;
11361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 if (PyUnicode_READY(self) == -1)
11363 return NULL;
11364 length = PyUnicode_GET_LENGTH(self);
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
11367
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 1) {
11370 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11371 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11372 (Py_UNICODE_ISUPPER(ch) != 0));
11373 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011375 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011377 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011378
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379 cased = 0;
11380 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 for (i = 0; i < length; i++) {
11382 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011383
Benjamin Peterson29060642009-01-31 22:14:21 +000011384 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11385 if (previous_is_cased)
11386 return PyBool_FromLong(0);
11387 previous_is_cased = 1;
11388 cased = 1;
11389 }
11390 else if (Py_UNICODE_ISLOWER(ch)) {
11391 if (!previous_is_cased)
11392 return PyBool_FromLong(0);
11393 previous_is_cased = 1;
11394 cased = 1;
11395 }
11396 else
11397 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011399 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400}
11401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011402PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011405Return True if all characters in S are whitespace\n\
11406and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
11408static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011409unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 Py_ssize_t i, length;
11412 int kind;
11413 void *data;
11414
11415 if (PyUnicode_READY(self) == -1)
11416 return NULL;
11417 length = PyUnicode_GET_LENGTH(self);
11418 kind = PyUnicode_KIND(self);
11419 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 if (length == 1)
11423 return PyBool_FromLong(
11424 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011426 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011428 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 for (i = 0; i < length; i++) {
11431 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011432 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011435 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436}
11437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011438PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011440\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011441Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011442and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011443
11444static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011445unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 Py_ssize_t i, length;
11448 int kind;
11449 void *data;
11450
11451 if (PyUnicode_READY(self) == -1)
11452 return NULL;
11453 length = PyUnicode_GET_LENGTH(self);
11454 kind = PyUnicode_KIND(self);
11455 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011456
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011457 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (length == 1)
11459 return PyBool_FromLong(
11460 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011461
11462 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 for (i = 0; i < length; i++) {
11467 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011468 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011469 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011470 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011471}
11472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011473PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011475\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011476Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011477and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011478
11479static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011480unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 int kind;
11483 void *data;
11484 Py_ssize_t len, i;
11485
11486 if (PyUnicode_READY(self) == -1)
11487 return NULL;
11488
11489 kind = PyUnicode_KIND(self);
11490 data = PyUnicode_DATA(self);
11491 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011492
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011493 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 if (len == 1) {
11495 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11496 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11497 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011498
11499 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 for (i = 0; i < len; i++) {
11504 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011505 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011507 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011508 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011509}
11510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011511PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011514Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011515False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
11517static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011518unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 Py_ssize_t i, length;
11521 int kind;
11522 void *data;
11523
11524 if (PyUnicode_READY(self) == -1)
11525 return NULL;
11526 length = PyUnicode_GET_LENGTH(self);
11527 kind = PyUnicode_KIND(self);
11528 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 if (length == 1)
11532 return PyBool_FromLong(
11533 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011535 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 for (i = 0; i < length; i++) {
11540 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011541 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011543 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544}
11545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011546PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011549Return True if all characters in S are digits\n\
11550and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
11552static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011553unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 Py_ssize_t i, length;
11556 int kind;
11557 void *data;
11558
11559 if (PyUnicode_READY(self) == -1)
11560 return NULL;
11561 length = PyUnicode_GET_LENGTH(self);
11562 kind = PyUnicode_KIND(self);
11563 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 if (length == 1) {
11567 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11568 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011571 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011573 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 for (i = 0; i < length; i++) {
11576 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011579 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580}
11581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011582PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011585Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011586False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
11588static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011589unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 Py_ssize_t i, length;
11592 int kind;
11593 void *data;
11594
11595 if (PyUnicode_READY(self) == -1)
11596 return NULL;
11597 length = PyUnicode_GET_LENGTH(self);
11598 kind = PyUnicode_KIND(self);
11599 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 if (length == 1)
11603 return PyBool_FromLong(
11604 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011606 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011608 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 for (i = 0; i < length; i++) {
11611 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011614 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615}
11616
Martin v. Löwis47383402007-08-15 07:32:56 +000011617int
11618PyUnicode_IsIdentifier(PyObject *self)
11619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 int kind;
11621 void *data;
11622 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011623 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if (PyUnicode_READY(self) == -1) {
11626 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 }
11629
11630 /* Special case for empty strings */
11631 if (PyUnicode_GET_LENGTH(self) == 0)
11632 return 0;
11633 kind = PyUnicode_KIND(self);
11634 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011635
11636 /* PEP 3131 says that the first character must be in
11637 XID_Start and subsequent characters in XID_Continue,
11638 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011639 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011640 letters, digits, underscore). However, given the current
11641 definition of XID_Start and XID_Continue, it is sufficient
11642 to check just for these, except that _ must be allowed
11643 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011645 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011646 return 0;
11647
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011648 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011651 return 1;
11652}
11653
11654PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011656\n\
11657Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011658to the language definition.\n\
11659\n\
11660Use keyword.iskeyword() to test for reserved identifiers\n\
11661such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011662
11663static PyObject*
11664unicode_isidentifier(PyObject *self)
11665{
11666 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11667}
11668
Georg Brandl559e5d72008-06-11 18:37:52 +000011669PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011670 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011671\n\
11672Return True if all characters in S are considered\n\
11673printable in repr() or S is empty, False otherwise.");
11674
11675static PyObject*
11676unicode_isprintable(PyObject *self)
11677{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 Py_ssize_t i, length;
11679 int kind;
11680 void *data;
11681
11682 if (PyUnicode_READY(self) == -1)
11683 return NULL;
11684 length = PyUnicode_GET_LENGTH(self);
11685 kind = PyUnicode_KIND(self);
11686 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011687
11688 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 if (length == 1)
11690 return PyBool_FromLong(
11691 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 for (i = 0; i < length; i++) {
11694 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011695 Py_RETURN_FALSE;
11696 }
11697 }
11698 Py_RETURN_TRUE;
11699}
11700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011701PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011702 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703\n\
11704Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011705iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706
11707static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011708unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011710 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711}
11712
Martin v. Löwis18e16552006-02-15 17:27:45 +000011713static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011714unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 if (PyUnicode_READY(self) == -1)
11717 return -1;
11718 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719}
11720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011721PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011724Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011725done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
11727static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011728unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011730 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 Py_UCS4 fillchar = ' ';
11732
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011733 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734 return NULL;
11735
Benjamin Petersonbac79492012-01-14 13:34:47 -050011736 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
Victor Stinnerc4b49542011-12-11 22:44:26 +010011739 if (PyUnicode_GET_LENGTH(self) >= width)
11740 return unicode_result_unchanged(self);
11741
11742 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743}
11744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011745PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011748Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
11750static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011751unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011753 if (PyUnicode_READY(self) == -1)
11754 return NULL;
11755 if (PyUnicode_IS_ASCII(self))
11756 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011757 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758}
11759
/* Strip modes; also the indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above.  Both the
   strings and the table slots are const: nothing may modify this table. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for strip mode i: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11768
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011769/* externally visible for str.strip(unicode) */
11770PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011771_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 void *data;
11774 int kind;
11775 Py_ssize_t i, j, len;
11776 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011777 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11780 return NULL;
11781
11782 kind = PyUnicode_KIND(self);
11783 data = PyUnicode_DATA(self);
11784 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011785 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11787 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011788 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011789
Benjamin Peterson14339b62009-01-31 16:36:08 +000011790 i = 0;
11791 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011792 while (i < len) {
11793 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11794 if (!BLOOM(sepmask, ch))
11795 break;
11796 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11797 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 i++;
11799 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011801
Benjamin Peterson14339b62009-01-31 16:36:08 +000011802 j = len;
11803 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011804 j--;
11805 while (j >= i) {
11806 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11807 if (!BLOOM(sepmask, ch))
11808 break;
11809 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11810 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011812 }
11813
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011816
Victor Stinner7931d9a2011-11-04 00:22:48 +010011817 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818}
11819
11820PyObject*
11821PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11822{
11823 unsigned char *data;
11824 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011825 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826
Victor Stinnerde636f32011-10-01 03:55:54 +020011827 if (PyUnicode_READY(self) == -1)
11828 return NULL;
11829
Victor Stinner684d5fd2012-05-03 02:32:34 +020011830 length = PyUnicode_GET_LENGTH(self);
11831 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011832
Victor Stinner684d5fd2012-05-03 02:32:34 +020011833 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011834 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835
Victor Stinnerde636f32011-10-01 03:55:54 +020011836 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011837 PyErr_SetString(PyExc_IndexError, "string index out of range");
11838 return NULL;
11839 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011840 if (start >= length || end < start)
11841 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011842
Victor Stinner684d5fd2012-05-03 02:32:34 +020011843 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011844 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011845 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011846 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011847 }
11848 else {
11849 kind = PyUnicode_KIND(self);
11850 data = PyUnicode_1BYTE_DATA(self);
11851 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011852 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011853 length);
11854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856
11857static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011858do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 Py_ssize_t len, i, j;
11861
11862 if (PyUnicode_READY(self) == -1)
11863 return NULL;
11864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011866
Victor Stinnercc7af722013-04-09 22:39:24 +020011867 if (PyUnicode_IS_ASCII(self)) {
11868 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11869
11870 i = 0;
11871 if (striptype != RIGHTSTRIP) {
11872 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011873 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020011874 if (!_Py_ascii_whitespace[ch])
11875 break;
11876 i++;
11877 }
11878 }
11879
11880 j = len;
11881 if (striptype != LEFTSTRIP) {
11882 j--;
11883 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011884 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020011885 if (!_Py_ascii_whitespace[ch])
11886 break;
11887 j--;
11888 }
11889 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011890 }
11891 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011892 else {
11893 int kind = PyUnicode_KIND(self);
11894 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011895
Victor Stinnercc7af722013-04-09 22:39:24 +020011896 i = 0;
11897 if (striptype != RIGHTSTRIP) {
11898 while (i < len) {
11899 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11900 if (!Py_UNICODE_ISSPACE(ch))
11901 break;
11902 i++;
11903 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011904 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011905
11906 j = len;
11907 if (striptype != LEFTSTRIP) {
11908 j--;
11909 while (j >= i) {
11910 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11911 if (!Py_UNICODE_ISSPACE(ch))
11912 break;
11913 j--;
11914 }
11915 j++;
11916 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011917 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011918
Victor Stinner7931d9a2011-11-04 00:22:48 +010011919 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920}
11921
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011922
11923static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011924do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011925{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011926 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011927
Serhiy Storchakac6792272013-10-19 21:03:34 +030011928 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011929 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011930
Benjamin Peterson14339b62009-01-31 16:36:08 +000011931 if (sep != NULL && sep != Py_None) {
11932 if (PyUnicode_Check(sep))
11933 return _PyUnicode_XStrip(self, striptype, sep);
11934 else {
11935 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 "%s arg must be None or str",
11937 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011938 return NULL;
11939 }
11940 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011941
Benjamin Peterson14339b62009-01-31 16:36:08 +000011942 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011943}
11944
11945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011946PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011948\n\
11949Return a copy of the string S with leading and trailing\n\
11950whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011951If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011952
11953static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011954unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011955{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011956 if (PyTuple_GET_SIZE(args) == 0)
11957 return do_strip(self, BOTHSTRIP); /* Common case */
11958 else
11959 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011960}
11961
11962
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011963PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011965\n\
11966Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011967If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011968
11969static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011970unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011971{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011972 if (PyTuple_GET_SIZE(args) == 0)
11973 return do_strip(self, LEFTSTRIP); /* Common case */
11974 else
11975 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011976}
11977
11978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011979PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011981\n\
11982Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011983If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011984
11985static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011986unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011987{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011988 if (PyTuple_GET_SIZE(args) == 0)
11989 return do_strip(self, RIGHTSTRIP); /* Common case */
11990 else
11991 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011992}
11993
11994
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011996unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011998 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
Serhiy Storchaka05997252013-01-26 12:14:02 +020012001 if (len < 1)
12002 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
Victor Stinnerc4b49542011-12-11 22:44:26 +010012004 /* no repeat, return original string */
12005 if (len == 1)
12006 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012007
Benjamin Petersonbac79492012-01-14 13:34:47 -050012008 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 return NULL;
12010
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012011 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012012 PyErr_SetString(PyExc_OverflowError,
12013 "repeated string is too long");
12014 return NULL;
12015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012017
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012018 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019 if (!u)
12020 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012021 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 if (PyUnicode_GET_LENGTH(str) == 1) {
12024 const int kind = PyUnicode_KIND(str);
12025 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012026 if (kind == PyUnicode_1BYTE_KIND) {
12027 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012028 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012029 }
12030 else if (kind == PyUnicode_2BYTE_KIND) {
12031 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012032 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012033 ucs2[n] = fill_char;
12034 } else {
12035 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12036 assert(kind == PyUnicode_4BYTE_KIND);
12037 for (n = 0; n < len; ++n)
12038 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 }
12041 else {
12042 /* number of characters copied this far */
12043 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012044 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 char *to = (char *) PyUnicode_DATA(u);
12046 Py_MEMCPY(to, PyUnicode_DATA(str),
12047 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 n = (done <= nchars-done) ? done : nchars-done;
12050 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012051 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053 }
12054
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012055 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012056 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057}
12058
Alexander Belopolsky40018472011-02-26 01:02:56 +000012059PyObject *
12060PyUnicode_Replace(PyObject *obj,
12061 PyObject *subobj,
12062 PyObject *replobj,
12063 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064{
12065 PyObject *self;
12066 PyObject *str1;
12067 PyObject *str2;
12068 PyObject *result;
12069
12070 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012071 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012074 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 Py_DECREF(self);
12076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077 }
12078 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012079 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 Py_DECREF(self);
12081 Py_DECREF(str1);
12082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012084 if (PyUnicode_READY(self) == -1 ||
12085 PyUnicode_READY(str1) == -1 ||
12086 PyUnicode_READY(str2) == -1)
12087 result = NULL;
12088 else
12089 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090 Py_DECREF(self);
12091 Py_DECREF(str1);
12092 Py_DECREF(str2);
12093 return result;
12094}
12095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012096PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012097 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098\n\
12099Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012100old replaced by new. If the optional argument count is\n\
12101given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
12103static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 PyObject *str1;
12107 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012108 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109 PyObject *result;
12110
Martin v. Löwis18e16552006-02-15 17:27:45 +000012111 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012113 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012116 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 return NULL;
12118 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012119 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 Py_DECREF(str1);
12121 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012122 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012123 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12124 result = NULL;
12125 else
12126 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
12128 Py_DECREF(str1);
12129 Py_DECREF(str2);
12130 return result;
12131}
12132
Alexander Belopolsky40018472011-02-26 01:02:56 +000012133static PyObject *
12134unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012136 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 Py_ssize_t isize;
12138 Py_ssize_t osize, squote, dquote, i, o;
12139 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012140 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012144 return NULL;
12145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 isize = PyUnicode_GET_LENGTH(unicode);
12147 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 /* Compute length of output, quote characters, and
12150 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012151 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 max = 127;
12153 squote = dquote = 0;
12154 ikind = PyUnicode_KIND(unicode);
12155 for (i = 0; i < isize; i++) {
12156 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12157 switch (ch) {
12158 case '\'': squote++; osize++; break;
12159 case '"': dquote++; osize++; break;
12160 case '\\': case '\t': case '\r': case '\n':
12161 osize += 2; break;
12162 default:
12163 /* Fast-path ASCII */
12164 if (ch < ' ' || ch == 0x7f)
12165 osize += 4; /* \xHH */
12166 else if (ch < 0x7f)
12167 osize++;
12168 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12169 osize++;
12170 max = ch > max ? ch : max;
12171 }
12172 else if (ch < 0x100)
12173 osize += 4; /* \xHH */
12174 else if (ch < 0x10000)
12175 osize += 6; /* \uHHHH */
12176 else
12177 osize += 10; /* \uHHHHHHHH */
12178 }
12179 }
12180
12181 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012182 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012184 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 if (dquote)
12186 /* Both squote and dquote present. Use squote,
12187 and escape them */
12188 osize += squote;
12189 else
12190 quote = '"';
12191 }
Victor Stinner55c08782013-04-14 18:45:39 +020012192 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193
12194 repr = PyUnicode_New(osize, max);
12195 if (repr == NULL)
12196 return NULL;
12197 okind = PyUnicode_KIND(repr);
12198 odata = PyUnicode_DATA(repr);
12199
12200 PyUnicode_WRITE(okind, odata, 0, quote);
12201 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012202 if (unchanged) {
12203 _PyUnicode_FastCopyCharacters(repr, 1,
12204 unicode, 0,
12205 isize);
12206 }
12207 else {
12208 for (i = 0, o = 1; i < isize; i++) {
12209 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210
Victor Stinner55c08782013-04-14 18:45:39 +020012211 /* Escape quotes and backslashes */
12212 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012213 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012215 continue;
12216 }
12217
12218 /* Map special whitespace to '\t', \n', '\r' */
12219 if (ch == '\t') {
12220 PyUnicode_WRITE(okind, odata, o++, '\\');
12221 PyUnicode_WRITE(okind, odata, o++, 't');
12222 }
12223 else if (ch == '\n') {
12224 PyUnicode_WRITE(okind, odata, o++, '\\');
12225 PyUnicode_WRITE(okind, odata, o++, 'n');
12226 }
12227 else if (ch == '\r') {
12228 PyUnicode_WRITE(okind, odata, o++, '\\');
12229 PyUnicode_WRITE(okind, odata, o++, 'r');
12230 }
12231
12232 /* Map non-printable US ASCII to '\xhh' */
12233 else if (ch < ' ' || ch == 0x7F) {
12234 PyUnicode_WRITE(okind, odata, o++, '\\');
12235 PyUnicode_WRITE(okind, odata, o++, 'x');
12236 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12237 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12238 }
12239
12240 /* Copy ASCII characters as-is */
12241 else if (ch < 0x7F) {
12242 PyUnicode_WRITE(okind, odata, o++, ch);
12243 }
12244
12245 /* Non-ASCII characters */
12246 else {
12247 /* Map Unicode whitespace and control characters
12248 (categories Z* and C* except ASCII space)
12249 */
12250 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12251 PyUnicode_WRITE(okind, odata, o++, '\\');
12252 /* Map 8-bit characters to '\xhh' */
12253 if (ch <= 0xff) {
12254 PyUnicode_WRITE(okind, odata, o++, 'x');
12255 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12256 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12257 }
12258 /* Map 16-bit characters to '\uxxxx' */
12259 else if (ch <= 0xffff) {
12260 PyUnicode_WRITE(okind, odata, o++, 'u');
12261 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12262 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12263 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12264 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12265 }
12266 /* Map 21-bit characters to '\U00xxxxxx' */
12267 else {
12268 PyUnicode_WRITE(okind, odata, o++, 'U');
12269 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12270 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12271 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12272 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12273 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12274 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12275 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12276 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12277 }
12278 }
12279 /* Copy characters as-is */
12280 else {
12281 PyUnicode_WRITE(okind, odata, o++, ch);
12282 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012283 }
12284 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012287 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012288 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289}
12290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012291PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293\n\
12294Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012295such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296arguments start and end are interpreted as in slice notation.\n\
12297\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012298Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299
12300static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012303 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012304 Py_ssize_t start;
12305 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
Jesus Ceaac451502011-04-20 17:09:23 +020012308 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12309 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311
Christian Heimesea71a522013-06-29 21:17:34 +020012312 if (PyUnicode_READY(self) == -1) {
12313 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012315 }
12316 if (PyUnicode_READY(substring) == -1) {
12317 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320
Victor Stinner7931d9a2011-11-04 00:22:48 +010012321 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322
12323 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 if (result == -2)
12326 return NULL;
12327
Christian Heimes217cfd12007-12-02 14:31:20 +000012328 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329}
12330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012331PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012334Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335
12336static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012339 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012340 Py_ssize_t start;
12341 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012342 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343
Jesus Ceaac451502011-04-20 17:09:23 +020012344 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12345 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347
Christian Heimesea71a522013-06-29 21:17:34 +020012348 if (PyUnicode_READY(self) == -1) {
12349 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012351 }
12352 if (PyUnicode_READY(substring) == -1) {
12353 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356
Victor Stinner7931d9a2011-11-04 00:22:48 +010012357 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
12359 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 if (result == -2)
12362 return NULL;
12363
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 if (result < 0) {
12365 PyErr_SetString(PyExc_ValueError, "substring not found");
12366 return NULL;
12367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368
Christian Heimes217cfd12007-12-02 14:31:20 +000012369 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370}
12371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012372PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012373 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012375Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012376done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
12378static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012379unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012381 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 Py_UCS4 fillchar = ' ';
12383
Victor Stinnere9a29352011-10-01 02:14:59 +020012384 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012386
Benjamin Petersonbac79492012-01-14 13:34:47 -050012387 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388 return NULL;
12389
Victor Stinnerc4b49542011-12-11 22:44:26 +010012390 if (PyUnicode_GET_LENGTH(self) >= width)
12391 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392
Victor Stinnerc4b49542011-12-11 22:44:26 +010012393 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394}
12395
Alexander Belopolsky40018472011-02-26 01:02:56 +000012396PyObject *
12397PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012398{
12399 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012400
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401 s = PyUnicode_FromObject(s);
12402 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012403 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 if (sep != NULL) {
12405 sep = PyUnicode_FromObject(sep);
12406 if (sep == NULL) {
12407 Py_DECREF(s);
12408 return NULL;
12409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410 }
12411
Victor Stinner9310abb2011-10-05 00:59:23 +020012412 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413
12414 Py_DECREF(s);
12415 Py_XDECREF(sep);
12416 return result;
12417}
12418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012419PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012420 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421\n\
12422Return a list of the words in S, using sep as the\n\
12423delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012424splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012425whitespace string is a separator and empty strings are\n\
12426removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427
12428static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012429unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012431 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012433 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012435 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12436 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437 return NULL;
12438
12439 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012442 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012444 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445}
12446
Thomas Wouters477c8d52006-05-27 19:21:47 +000012447PyObject *
12448PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12449{
12450 PyObject* str_obj;
12451 PyObject* sep_obj;
12452 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 int kind1, kind2, kind;
12454 void *buf1 = NULL, *buf2 = NULL;
12455 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456
12457 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012458 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012459 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012460 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012461 if (!sep_obj) {
12462 Py_DECREF(str_obj);
12463 return NULL;
12464 }
12465 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12466 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012467 Py_DECREF(str_obj);
12468 return NULL;
12469 }
12470
Victor Stinner14f8f022011-10-05 20:58:25 +020012471 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012473 kind = Py_MAX(kind1, kind2);
12474 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012476 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 if (!buf1)
12478 goto onError;
12479 buf2 = PyUnicode_DATA(sep_obj);
12480 if (kind2 != kind)
12481 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12482 if (!buf2)
12483 goto onError;
12484 len1 = PyUnicode_GET_LENGTH(str_obj);
12485 len2 = PyUnicode_GET_LENGTH(sep_obj);
12486
Benjamin Petersonead6b532011-12-20 17:23:42 -060012487 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012489 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12490 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12491 else
12492 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 break;
12494 case PyUnicode_2BYTE_KIND:
12495 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12496 break;
12497 case PyUnicode_4BYTE_KIND:
12498 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12499 break;
12500 default:
12501 assert(0);
12502 out = 0;
12503 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012504
12505 Py_DECREF(sep_obj);
12506 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 if (kind1 != kind)
12508 PyMem_Free(buf1);
12509 if (kind2 != kind)
12510 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012511
12512 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 onError:
12514 Py_DECREF(sep_obj);
12515 Py_DECREF(str_obj);
12516 if (kind1 != kind && buf1)
12517 PyMem_Free(buf1);
12518 if (kind2 != kind && buf2)
12519 PyMem_Free(buf2);
12520 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012521}
12522
12523
12524PyObject *
12525PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12526{
12527 PyObject* str_obj;
12528 PyObject* sep_obj;
12529 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 int kind1, kind2, kind;
12531 void *buf1 = NULL, *buf2 = NULL;
12532 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012533
12534 str_obj = PyUnicode_FromObject(str_in);
12535 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012537 sep_obj = PyUnicode_FromObject(sep_in);
12538 if (!sep_obj) {
12539 Py_DECREF(str_obj);
12540 return NULL;
12541 }
12542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 kind1 = PyUnicode_KIND(str_in);
12544 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012545 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 buf1 = PyUnicode_DATA(str_in);
12547 if (kind1 != kind)
12548 buf1 = _PyUnicode_AsKind(str_in, kind);
12549 if (!buf1)
12550 goto onError;
12551 buf2 = PyUnicode_DATA(sep_obj);
12552 if (kind2 != kind)
12553 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12554 if (!buf2)
12555 goto onError;
12556 len1 = PyUnicode_GET_LENGTH(str_obj);
12557 len2 = PyUnicode_GET_LENGTH(sep_obj);
12558
Benjamin Petersonead6b532011-12-20 17:23:42 -060012559 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012561 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12562 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12563 else
12564 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 break;
12566 case PyUnicode_2BYTE_KIND:
12567 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12568 break;
12569 case PyUnicode_4BYTE_KIND:
12570 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12571 break;
12572 default:
12573 assert(0);
12574 out = 0;
12575 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012576
12577 Py_DECREF(sep_obj);
12578 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 if (kind1 != kind)
12580 PyMem_Free(buf1);
12581 if (kind2 != kind)
12582 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012583
12584 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 onError:
12586 Py_DECREF(sep_obj);
12587 Py_DECREF(str_obj);
12588 if (kind1 != kind && buf1)
12589 PyMem_Free(buf1);
12590 if (kind2 != kind && buf2)
12591 PyMem_Free(buf2);
12592 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012593}
12594
12595PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012596 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012597\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012598Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012599the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012600found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012601
12602static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012603unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012604{
Victor Stinner9310abb2011-10-05 00:59:23 +020012605 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012606}
12607
12608PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012609 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012610\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012611Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012612the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012613separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012614
12615static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012616unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012617{
Victor Stinner9310abb2011-10-05 00:59:23 +020012618 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012619}
12620
Alexander Belopolsky40018472011-02-26 01:02:56 +000012621PyObject *
12622PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012623{
12624 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012625
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012626 s = PyUnicode_FromObject(s);
12627 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012628 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012629 if (sep != NULL) {
12630 sep = PyUnicode_FromObject(sep);
12631 if (sep == NULL) {
12632 Py_DECREF(s);
12633 return NULL;
12634 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012635 }
12636
Victor Stinner9310abb2011-10-05 00:59:23 +020012637 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012638
12639 Py_DECREF(s);
12640 Py_XDECREF(sep);
12641 return result;
12642}
12643
12644PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012645 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012646\n\
12647Return a list of the words in S, using sep as the\n\
12648delimiter string, starting at the end of the string and\n\
12649working to the front. If maxsplit is given, at most maxsplit\n\
12650splits are done. If sep is not specified, any whitespace string\n\
12651is a separator.");
12652
12653static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012654unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012655{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012656 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012657 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012658 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012659
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012660 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12661 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012662 return NULL;
12663
12664 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012666 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012667 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012668 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012669 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012670}
12671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012672PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012673 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674\n\
12675Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012676Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012677is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678
12679static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012680unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012682 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012683 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012685 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12686 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687 return NULL;
12688
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012689 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690}
12691
12692static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012693PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012695 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696}
12697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012698PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700\n\
12701Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012702and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703
12704static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012705unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012707 if (PyUnicode_READY(self) == -1)
12708 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012709 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710}
12711
Larry Hastings31826802013-10-19 00:09:25 -070012712/*[clinic]
12713module str
Georg Brandlceee0772007-11-27 23:48:05 +000012714
Larry Hastings31826802013-10-19 00:09:25 -070012715@staticmethod
12716str.maketrans as unicode_maketrans
12717
12718 x: object
12719
12720 y: unicode=NULL
12721
12722 z: unicode=NULL
12723
12724 /
12725
12726Return a translation table usable for str.translate().
12727
12728If there is only one argument, it must be a dictionary mapping Unicode
12729ordinals (integers) or characters to Unicode ordinals, strings or None.
12730Character keys will be then converted to ordinals.
12731If there are two arguments, they must be strings of equal length, and
12732in the resulting dictionary, each character in x will be mapped to the
12733character at the same position in y. If there is a third argument, it
12734must be a string, whose characters will be mapped to None in the result.
12735[clinic]*/
12736
12737PyDoc_STRVAR(unicode_maketrans__doc__,
12738"Return a translation table usable for str.translate().\n"
12739"\n"
12740"str.maketrans(x, y=None, z=None)\n"
12741"\n"
12742"If there is only one argument, it must be a dictionary mapping Unicode\n"
12743"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12744"Character keys will be then converted to ordinals.\n"
12745"If there are two arguments, they must be strings of equal length, and\n"
12746"in the resulting dictionary, each character in x will be mapped to the\n"
12747"character at the same position in y. If there is a third argument, it\n"
12748"must be a string, whose characters will be mapped to None in the result.");
12749
12750#define UNICODE_MAKETRANS_METHODDEF \
12751 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12752
12753static PyObject *
12754unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
12755
12756static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012757unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012758{
Larry Hastings31826802013-10-19 00:09:25 -070012759 PyObject *return_value = NULL;
12760 PyObject *x;
12761 PyObject *y = NULL;
12762 PyObject *z = NULL;
12763
12764 if (!PyArg_ParseTuple(args,
12765 "O|UU:maketrans",
12766 &x, &y, &z))
12767 goto exit;
12768 return_value = unicode_maketrans_impl(x, y, z);
12769
12770exit:
12771 return return_value;
12772}
12773
12774static PyObject *
12775unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12776/*[clinic checksum: 137db9c3199e7906b7967009f511c24fa3235b5f]*/
12777{
Georg Brandlceee0772007-11-27 23:48:05 +000012778 PyObject *new = NULL, *key, *value;
12779 Py_ssize_t i = 0;
12780 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781
Georg Brandlceee0772007-11-27 23:48:05 +000012782 new = PyDict_New();
12783 if (!new)
12784 return NULL;
12785 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 int x_kind, y_kind, z_kind;
12787 void *x_data, *y_data, *z_data;
12788
Georg Brandlceee0772007-11-27 23:48:05 +000012789 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012790 if (!PyUnicode_Check(x)) {
12791 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12792 "be a string if there is a second argument");
12793 goto err;
12794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012796 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12797 "arguments must have equal length");
12798 goto err;
12799 }
12800 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801 x_kind = PyUnicode_KIND(x);
12802 y_kind = PyUnicode_KIND(y);
12803 x_data = PyUnicode_DATA(x);
12804 y_data = PyUnicode_DATA(y);
12805 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12806 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012807 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012808 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012809 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012810 if (!value) {
12811 Py_DECREF(key);
12812 goto err;
12813 }
Georg Brandlceee0772007-11-27 23:48:05 +000012814 res = PyDict_SetItem(new, key, value);
12815 Py_DECREF(key);
12816 Py_DECREF(value);
12817 if (res < 0)
12818 goto err;
12819 }
12820 /* create entries for deleting chars in z */
12821 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 z_kind = PyUnicode_KIND(z);
12823 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012824 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012826 if (!key)
12827 goto err;
12828 res = PyDict_SetItem(new, key, Py_None);
12829 Py_DECREF(key);
12830 if (res < 0)
12831 goto err;
12832 }
12833 }
12834 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 int kind;
12836 void *data;
12837
Georg Brandlceee0772007-11-27 23:48:05 +000012838 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012839 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012840 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12841 "to maketrans it must be a dict");
12842 goto err;
12843 }
12844 /* copy entries into the new dict, converting string keys to int keys */
12845 while (PyDict_Next(x, &i, &key, &value)) {
12846 if (PyUnicode_Check(key)) {
12847 /* convert string keys to integer keys */
12848 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012849 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012850 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12851 "table must be of length 1");
12852 goto err;
12853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 kind = PyUnicode_KIND(key);
12855 data = PyUnicode_DATA(key);
12856 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012857 if (!newkey)
12858 goto err;
12859 res = PyDict_SetItem(new, newkey, value);
12860 Py_DECREF(newkey);
12861 if (res < 0)
12862 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012863 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012864 /* just keep integer keys */
12865 if (PyDict_SetItem(new, key, value) < 0)
12866 goto err;
12867 } else {
12868 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12869 "be strings or integers");
12870 goto err;
12871 }
12872 }
12873 }
12874 return new;
12875 err:
12876 Py_DECREF(new);
12877 return NULL;
12878}
12879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012880PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012881 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882\n\
12883Return a copy of the string S, where all characters have been mapped\n\
12884through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012885Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012886Unmapped characters are left untouched. Characters mapped to None\n\
12887are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888
12889static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893}
12894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012895PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012896 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012898Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899
12900static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012901unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012903 if (PyUnicode_READY(self) == -1)
12904 return NULL;
12905 if (PyUnicode_IS_ASCII(self))
12906 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012907 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908}
12909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012910PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012911 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012913Pad a numeric string S with zeros on the left, to fill a field\n\
12914of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915
12916static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012917unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012919 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012920 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012921 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922 int kind;
12923 void *data;
12924 Py_UCS4 chr;
12925
Martin v. Löwis18e16552006-02-15 17:27:45 +000012926 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927 return NULL;
12928
Benjamin Petersonbac79492012-01-14 13:34:47 -050012929 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012931
Victor Stinnerc4b49542011-12-11 22:44:26 +010012932 if (PyUnicode_GET_LENGTH(self) >= width)
12933 return unicode_result_unchanged(self);
12934
12935 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012936
12937 u = pad(self, fill, 0, '0');
12938
Walter Dörwald068325e2002-04-15 13:36:47 +000012939 if (u == NULL)
12940 return NULL;
12941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 kind = PyUnicode_KIND(u);
12943 data = PyUnicode_DATA(u);
12944 chr = PyUnicode_READ(kind, data, fill);
12945
12946 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 PyUnicode_WRITE(kind, data, 0, chr);
12949 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950 }
12951
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012952 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012953 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955
#if 0
/* Compiled out.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012964PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012965 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012966\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012967Return True if S starts with the specified prefix, False otherwise.\n\
12968With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012969With optional end, stop comparing S at that position.\n\
12970prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971
12972static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012973unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012974 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012975{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012976 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012977 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012978 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012979 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012980 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012981
Jesus Ceaac451502011-04-20 17:09:23 +020012982 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012983 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012984 if (PyTuple_Check(subobj)) {
12985 Py_ssize_t i;
12986 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012987 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012988 if (substring == NULL)
12989 return NULL;
12990 result = tailmatch(self, substring, start, end, -1);
12991 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012992 if (result == -1)
12993 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012994 if (result) {
12995 Py_RETURN_TRUE;
12996 }
12997 }
12998 /* nothing matched */
12999 Py_RETURN_FALSE;
13000 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013001 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013002 if (substring == NULL) {
13003 if (PyErr_ExceptionMatches(PyExc_TypeError))
13004 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13005 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013007 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013008 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013009 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013010 if (result == -1)
13011 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013012 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013013}
13014
13015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013016PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013017 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013019Return True if S ends with the specified suffix, False otherwise.\n\
13020With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013021With optional end, stop comparing S at that position.\n\
13022suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023
13024static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013025unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013026 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013028 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013029 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013030 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013031 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013032 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013033
Jesus Ceaac451502011-04-20 17:09:23 +020013034 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013035 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013036 if (PyTuple_Check(subobj)) {
13037 Py_ssize_t i;
13038 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013039 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013040 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013041 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013043 result = tailmatch(self, substring, start, end, +1);
13044 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013045 if (result == -1)
13046 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013047 if (result) {
13048 Py_RETURN_TRUE;
13049 }
13050 }
13051 Py_RETURN_FALSE;
13052 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013053 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013054 if (substring == NULL) {
13055 if (PyErr_ExceptionMatches(PyExc_TypeError))
13056 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13057 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013058 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013059 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013060 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013061 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013062 if (result == -1)
13063 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013064 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065}
13066
Victor Stinner202fdca2012-05-07 12:47:02 +020013067Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013068_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013069{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013070 if (!writer->readonly)
13071 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13072 else {
13073 /* Copy-on-write mode: set buffer size to 0 so
13074 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13075 * next write. */
13076 writer->size = 0;
13077 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013078 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13079 writer->data = PyUnicode_DATA(writer->buffer);
13080 writer->kind = PyUnicode_KIND(writer->buffer);
13081}
13082
Victor Stinnerd3f08822012-05-29 12:57:52 +020013083void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013084_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013085{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013086 memset(writer, 0, sizeof(*writer));
13087#ifdef Py_DEBUG
13088 writer->kind = 5; /* invalid kind */
13089#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013090 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013091}
13092
Victor Stinnerd3f08822012-05-29 12:57:52 +020013093int
13094_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13095 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013096{
13097 Py_ssize_t newlen;
13098 PyObject *newbuffer;
13099
Victor Stinnerd3f08822012-05-29 12:57:52 +020013100 assert(length > 0);
13101
Victor Stinner202fdca2012-05-07 12:47:02 +020013102 if (length > PY_SSIZE_T_MAX - writer->pos) {
13103 PyErr_NoMemory();
13104 return -1;
13105 }
13106 newlen = writer->pos + length;
13107
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013108 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013109
Victor Stinnerd3f08822012-05-29 12:57:52 +020013110 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013111 assert(!writer->readonly);
13112 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013113 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013114 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013115 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013116 if (newlen < writer->min_length)
13117 newlen = writer->min_length;
13118
Victor Stinnerd3f08822012-05-29 12:57:52 +020013119 writer->buffer = PyUnicode_New(newlen, maxchar);
13120 if (writer->buffer == NULL)
13121 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013122 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013123 else if (newlen > writer->size) {
13124 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013125 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013126 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013127 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013128 if (newlen < writer->min_length)
13129 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013130
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013131 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013132 /* resize + widen */
13133 newbuffer = PyUnicode_New(newlen, maxchar);
13134 if (newbuffer == NULL)
13135 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013136 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13137 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013138 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013139 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013140 }
13141 else {
13142 newbuffer = resize_compact(writer->buffer, newlen);
13143 if (newbuffer == NULL)
13144 return -1;
13145 }
13146 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013147 }
13148 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013149 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013150 newbuffer = PyUnicode_New(writer->size, maxchar);
13151 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013152 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013153 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13154 writer->buffer, 0, writer->pos);
13155 Py_DECREF(writer->buffer);
13156 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013157 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013158 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013159 return 0;
13160}
13161
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013162Py_LOCAL_INLINE(int)
13163_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013164{
13165 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13166 return -1;
13167 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13168 writer->pos++;
13169 return 0;
13170}
13171
13172int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013173_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13174{
13175 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13176}
13177
13178int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013179_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13180{
13181 Py_UCS4 maxchar;
13182 Py_ssize_t len;
13183
13184 if (PyUnicode_READY(str) == -1)
13185 return -1;
13186 len = PyUnicode_GET_LENGTH(str);
13187 if (len == 0)
13188 return 0;
13189 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13190 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013191 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013192 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013193 Py_INCREF(str);
13194 writer->buffer = str;
13195 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013196 writer->pos += len;
13197 return 0;
13198 }
13199 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13200 return -1;
13201 }
13202 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13203 str, 0, len);
13204 writer->pos += len;
13205 return 0;
13206}
13207
Victor Stinnere215d962012-10-06 23:03:36 +020013208int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013209_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13210 Py_ssize_t start, Py_ssize_t end)
13211{
13212 Py_UCS4 maxchar;
13213 Py_ssize_t len;
13214
13215 if (PyUnicode_READY(str) == -1)
13216 return -1;
13217
13218 assert(0 <= start);
13219 assert(end <= PyUnicode_GET_LENGTH(str));
13220 assert(start <= end);
13221
13222 if (end == 0)
13223 return 0;
13224
13225 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13226 return _PyUnicodeWriter_WriteStr(writer, str);
13227
13228 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13229 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13230 else
13231 maxchar = writer->maxchar;
13232 len = end - start;
13233
13234 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13235 return -1;
13236
13237 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13238 str, start, len);
13239 writer->pos += len;
13240 return 0;
13241}
13242
13243int
Victor Stinnere215d962012-10-06 23:03:36 +020013244_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13245{
13246 Py_UCS4 maxchar;
13247
13248 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13249 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13250 return -1;
13251 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13252 writer->pos += len;
13253 return 0;
13254}
13255
Victor Stinnerd3f08822012-05-29 12:57:52 +020013256PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013257_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013258{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013259 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013260 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013261 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013262 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013263 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013264 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013265 str = writer->buffer;
13266 writer->buffer = NULL;
13267 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13268 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013269 }
13270 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13271 PyObject *newbuffer;
13272 newbuffer = resize_compact(writer->buffer, writer->pos);
13273 if (newbuffer == NULL) {
13274 Py_DECREF(writer->buffer);
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013275 writer->buffer = NULL;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013276 return NULL;
13277 }
13278 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013279 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013280 str = writer->buffer;
13281 writer->buffer = NULL;
13282 assert(_PyUnicode_CheckConsistency(str, 1));
13283 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013284}
13285
Victor Stinnerd3f08822012-05-29 12:57:52 +020013286void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013287_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013288{
13289 Py_CLEAR(writer->buffer);
13290}
13291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013293
13294PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013296\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013297Return a formatted version of S, using substitutions from args and kwargs.\n\
13298The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013299
Eric Smith27bbca62010-11-04 17:06:58 +000013300PyDoc_STRVAR(format_map__doc__,
13301 "S.format_map(mapping) -> str\n\
13302\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013303Return a formatted version of S, using substitutions from mapping.\n\
13304The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013305
Eric Smith4a7d76d2008-05-30 18:10:19 +000013306static PyObject *
13307unicode__format__(PyObject* self, PyObject* args)
13308{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013309 PyObject *format_spec;
13310 _PyUnicodeWriter writer;
13311 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013312
13313 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13314 return NULL;
13315
Victor Stinnerd3f08822012-05-29 12:57:52 +020013316 if (PyUnicode_READY(self) == -1)
13317 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013318 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013319 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13320 self, format_spec, 0,
13321 PyUnicode_GET_LENGTH(format_spec));
13322 if (ret == -1) {
13323 _PyUnicodeWriter_Dealloc(&writer);
13324 return NULL;
13325 }
13326 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013327}
13328
Eric Smith8c663262007-08-25 02:26:07 +000013329PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013330 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013331\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013332Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013333
13334static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013335unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013337 Py_ssize_t size;
13338
13339 /* If it's a compact object, account for base structure +
13340 character data. */
13341 if (PyUnicode_IS_COMPACT_ASCII(v))
13342 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13343 else if (PyUnicode_IS_COMPACT(v))
13344 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013345 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 else {
13347 /* If it is a two-block object, account for base object, and
13348 for character block if present. */
13349 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013350 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013351 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013352 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013353 }
13354 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013355 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013356 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013357 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013358 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013359 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013360
13361 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013362}
13363
13364PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013365 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013366
13367static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013368unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013369{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013370 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 if (!copy)
13372 return NULL;
13373 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013374}
13375
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013377 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013378 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013379 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13380 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013381 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13382 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013383 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013384 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13385 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13386 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13387 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13388 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013389 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013390 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13391 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13392 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013393 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013394 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13395 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13396 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013397 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013398 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013399 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013400 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013401 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13402 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13403 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13404 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13405 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13406 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13407 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13408 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13409 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13410 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13411 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13412 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13413 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13414 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013415 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013416 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013417 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013418 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013419 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013420 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013421 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013422 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013423#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013424 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013425 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426#endif
13427
Benjamin Peterson14339b62009-01-31 16:36:08 +000013428 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013429 {NULL, NULL}
13430};
13431
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013432static PyObject *
13433unicode_mod(PyObject *v, PyObject *w)
13434{
Brian Curtindfc80e32011-08-10 20:28:54 -050013435 if (!PyUnicode_Check(v))
13436 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013438}
13439
13440static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013441 0, /*nb_add*/
13442 0, /*nb_subtract*/
13443 0, /*nb_multiply*/
13444 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013445};
13446
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013448 (lenfunc) unicode_length, /* sq_length */
13449 PyUnicode_Concat, /* sq_concat */
13450 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13451 (ssizeargfunc) unicode_getitem, /* sq_item */
13452 0, /* sq_slice */
13453 0, /* sq_ass_item */
13454 0, /* sq_ass_slice */
13455 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013456};
13457
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013458static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013459unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013460{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013461 if (PyUnicode_READY(self) == -1)
13462 return NULL;
13463
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013464 if (PyIndex_Check(item)) {
13465 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013466 if (i == -1 && PyErr_Occurred())
13467 return NULL;
13468 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013469 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013470 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013471 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013472 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013473 PyObject *result;
13474 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013475 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013476 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013478 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013479 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013480 return NULL;
13481 }
13482
13483 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013484 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013485 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013486 slicelength == PyUnicode_GET_LENGTH(self)) {
13487 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013488 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013489 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013490 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013491 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013492 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013493 src_kind = PyUnicode_KIND(self);
13494 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013495 if (!PyUnicode_IS_ASCII(self)) {
13496 kind_limit = kind_maxchar_limit(src_kind);
13497 max_char = 0;
13498 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13499 ch = PyUnicode_READ(src_kind, src_data, cur);
13500 if (ch > max_char) {
13501 max_char = ch;
13502 if (max_char >= kind_limit)
13503 break;
13504 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013505 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013506 }
Victor Stinner55c99112011-10-13 01:17:06 +020013507 else
13508 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013509 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013510 if (result == NULL)
13511 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013512 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013513 dest_data = PyUnicode_DATA(result);
13514
13515 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013516 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13517 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013518 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013519 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013520 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013521 } else {
13522 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13523 return NULL;
13524 }
13525}
13526
13527static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013528 (lenfunc)unicode_length, /* mp_length */
13529 (binaryfunc)unicode_subscript, /* mp_subscript */
13530 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013531};
13532
Guido van Rossumd57fd912000-03-10 22:53:23 +000013533
Guido van Rossumd57fd912000-03-10 22:53:23 +000013534/* Helpers for PyUnicode_Format() */
13535
Victor Stinnera47082312012-10-04 02:19:54 +020013536struct unicode_formatter_t {
13537 PyObject *args;
13538 int args_owned;
13539 Py_ssize_t arglen, argidx;
13540 PyObject *dict;
13541
13542 enum PyUnicode_Kind fmtkind;
13543 Py_ssize_t fmtcnt, fmtpos;
13544 void *fmtdata;
13545 PyObject *fmtstr;
13546
13547 _PyUnicodeWriter writer;
13548};
13549
13550struct unicode_format_arg_t {
13551 Py_UCS4 ch;
13552 int flags;
13553 Py_ssize_t width;
13554 int prec;
13555 int sign;
13556};
13557
Guido van Rossumd57fd912000-03-10 22:53:23 +000013558static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013559unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013560{
Victor Stinnera47082312012-10-04 02:19:54 +020013561 Py_ssize_t argidx = ctx->argidx;
13562
13563 if (argidx < ctx->arglen) {
13564 ctx->argidx++;
13565 if (ctx->arglen < 0)
13566 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 else
Victor Stinnera47082312012-10-04 02:19:54 +020013568 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013569 }
13570 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013572 return NULL;
13573}
13574
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013575/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013576
Victor Stinnera47082312012-10-04 02:19:54 +020013577/* Format a float into the writer if the writer is not NULL, or into *p_output
13578 otherwise.
13579
13580 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013581static int
Victor Stinnera47082312012-10-04 02:19:54 +020013582formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13583 PyObject **p_output,
13584 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013585{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013586 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013587 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013588 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013589 int prec;
13590 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013591
Guido van Rossumd57fd912000-03-10 22:53:23 +000013592 x = PyFloat_AsDouble(v);
13593 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013594 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013595
Victor Stinnera47082312012-10-04 02:19:54 +020013596 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013597 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013599
Victor Stinnera47082312012-10-04 02:19:54 +020013600 if (arg->flags & F_ALT)
13601 dtoa_flags = Py_DTSF_ALT;
13602 else
13603 dtoa_flags = 0;
13604 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013605 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013606 return -1;
13607 len = strlen(p);
13608 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013609 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13610 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013611 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013612 }
Victor Stinner184252a2012-06-16 02:57:41 +020013613 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013614 writer->pos += len;
13615 }
13616 else
13617 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013618 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013619 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013620}
13621
Victor Stinnerd0880d52012-04-27 23:40:13 +020013622/* formatlong() emulates the format codes d, u, o, x and X, and
13623 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13624 * Python's regular ints.
13625 * Return value: a new PyUnicodeObject*, or NULL if error.
13626 * The output string is of the form
13627 * "-"? ("0x" | "0X")? digit+
13628 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13629 * set in flags. The case of hex digits will be correct.
13630 * There will be at least prec digits, zero-filled on the left if
13631 * necessary to get that many.
13632 * val object to be converted
13633 * flags bitmask of format flags; only F_ALT is looked at
13634 * prec minimum number of digits; 0-fill on left if needed
13635 * type a character in [duoxX]; u acts the same as d
13636 *
13637 * CAUTION: o, x and X conversions on regular ints can never
13638 * produce a '-' sign, but can for Python's unbounded ints.
13639 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013640static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013641formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013642{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013643 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013644 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013645 Py_ssize_t i;
13646 int sign; /* 1 if '-', else 0 */
13647 int len; /* number of characters */
13648 Py_ssize_t llen;
13649 int numdigits; /* len == numnondigits + numdigits */
13650 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013651 int prec = arg->prec;
13652 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013653
Victor Stinnerd0880d52012-04-27 23:40:13 +020013654 /* Avoid exceeding SSIZE_T_MAX */
13655 if (prec > INT_MAX-3) {
13656 PyErr_SetString(PyExc_OverflowError,
13657 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013658 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013659 }
13660
13661 assert(PyLong_Check(val));
13662
13663 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013664 default:
13665 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013666 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013667 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013668 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013669 /* int and int subclasses should print numerically when a numeric */
13670 /* format code is used (see issue18780) */
13671 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013672 break;
13673 case 'o':
13674 numnondigits = 2;
13675 result = PyNumber_ToBase(val, 8);
13676 break;
13677 case 'x':
13678 case 'X':
13679 numnondigits = 2;
13680 result = PyNumber_ToBase(val, 16);
13681 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013682 }
13683 if (!result)
13684 return NULL;
13685
13686 assert(unicode_modifiable(result));
13687 assert(PyUnicode_IS_READY(result));
13688 assert(PyUnicode_IS_ASCII(result));
13689
13690 /* To modify the string in-place, there can only be one reference. */
13691 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013692 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013693 PyErr_BadInternalCall();
13694 return NULL;
13695 }
13696 buf = PyUnicode_DATA(result);
13697 llen = PyUnicode_GET_LENGTH(result);
13698 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013699 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013700 PyErr_SetString(PyExc_ValueError,
13701 "string too large in _PyBytes_FormatLong");
13702 return NULL;
13703 }
13704 len = (int)llen;
13705 sign = buf[0] == '-';
13706 numnondigits += sign;
13707 numdigits = len - numnondigits;
13708 assert(numdigits > 0);
13709
13710 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013711 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013712 (type == 'o' || type == 'x' || type == 'X'))) {
13713 assert(buf[sign] == '0');
13714 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13715 buf[sign+1] == 'o');
13716 numnondigits -= 2;
13717 buf += 2;
13718 len -= 2;
13719 if (sign)
13720 buf[0] = '-';
13721 assert(len == numnondigits + numdigits);
13722 assert(numdigits > 0);
13723 }
13724
13725 /* Fill with leading zeroes to meet minimum width. */
13726 if (prec > numdigits) {
13727 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13728 numnondigits + prec);
13729 char *b1;
13730 if (!r1) {
13731 Py_DECREF(result);
13732 return NULL;
13733 }
13734 b1 = PyBytes_AS_STRING(r1);
13735 for (i = 0; i < numnondigits; ++i)
13736 *b1++ = *buf++;
13737 for (i = 0; i < prec - numdigits; i++)
13738 *b1++ = '0';
13739 for (i = 0; i < numdigits; i++)
13740 *b1++ = *buf++;
13741 *b1 = '\0';
13742 Py_DECREF(result);
13743 result = r1;
13744 buf = PyBytes_AS_STRING(result);
13745 len = numnondigits + prec;
13746 }
13747
13748 /* Fix up case for hex conversions. */
13749 if (type == 'X') {
13750 /* Need to convert all lower case letters to upper case.
13751 and need to convert 0x to 0X (and -0x to -0X). */
13752 for (i = 0; i < len; i++)
13753 if (buf[i] >= 'a' && buf[i] <= 'x')
13754 buf[i] -= 'a'-'A';
13755 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013756 if (!PyUnicode_Check(result)
13757 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013758 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013759 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013760 Py_DECREF(result);
13761 result = unicode;
13762 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013763 else if (len != PyUnicode_GET_LENGTH(result)) {
13764 if (PyUnicode_Resize(&result, len) < 0)
13765 Py_CLEAR(result);
13766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013767 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013768}
13769
Victor Stinner621ef3d2012-10-02 00:33:47 +020013770/* Format an integer.
13771 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013772 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013773 * -1 and raise an exception on error */
13774static int
Victor Stinnera47082312012-10-04 02:19:54 +020013775mainformatlong(PyObject *v,
13776 struct unicode_format_arg_t *arg,
13777 PyObject **p_output,
13778 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013779{
13780 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013781 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013782
13783 if (!PyNumber_Check(v))
13784 goto wrongtype;
13785
13786 if (!PyLong_Check(v)) {
13787 iobj = PyNumber_Long(v);
13788 if (iobj == NULL) {
13789 if (PyErr_ExceptionMatches(PyExc_TypeError))
13790 goto wrongtype;
13791 return -1;
13792 }
13793 assert(PyLong_Check(iobj));
13794 }
13795 else {
13796 iobj = v;
13797 Py_INCREF(iobj);
13798 }
13799
13800 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013801 && arg->width == -1 && arg->prec == -1
13802 && !(arg->flags & (F_SIGN | F_BLANK))
13803 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013804 {
13805 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013806 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013807 int base;
13808
Victor Stinnera47082312012-10-04 02:19:54 +020013809 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013810 {
13811 default:
13812 assert(0 && "'type' not in [diuoxX]");
13813 case 'd':
13814 case 'i':
13815 case 'u':
13816 base = 10;
13817 break;
13818 case 'o':
13819 base = 8;
13820 break;
13821 case 'x':
13822 case 'X':
13823 base = 16;
13824 break;
13825 }
13826
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013827 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13828 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013829 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013830 }
13831 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013832 return 1;
13833 }
13834
Victor Stinnera47082312012-10-04 02:19:54 +020013835 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013836 Py_DECREF(iobj);
13837 if (res == NULL)
13838 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013839 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013840 return 0;
13841
13842wrongtype:
13843 PyErr_Format(PyExc_TypeError,
13844 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013845 "not %.200s",
13846 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013847 return -1;
13848}
13849
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013850static Py_UCS4
13851formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013852{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013853 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013854 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013855 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013856 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013857 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013858 goto onError;
13859 }
13860 else {
13861 /* Integer input truncated to a character */
13862 long x;
13863 x = PyLong_AsLong(v);
13864 if (x == -1 && PyErr_Occurred())
13865 goto onError;
13866
Victor Stinner8faf8212011-12-08 22:14:11 +010013867 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013868 PyErr_SetString(PyExc_OverflowError,
13869 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013870 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 }
13872
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013873 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013874 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013875
Benjamin Peterson29060642009-01-31 22:14:21 +000013876 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013877 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013878 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013879 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013880}
13881
Victor Stinnera47082312012-10-04 02:19:54 +020013882/* Parse options of an argument: flags, width, precision.
13883 Handle also "%(name)" syntax.
13884
13885 Return 0 if the argument has been formatted into arg->str.
13886 Return 1 if the argument has been written into ctx->writer,
13887 Raise an exception and return -1 on error. */
13888static int
13889unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13890 struct unicode_format_arg_t *arg)
13891{
13892#define FORMAT_READ(ctx) \
13893 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13894
13895 PyObject *v;
13896
Victor Stinnera47082312012-10-04 02:19:54 +020013897 if (arg->ch == '(') {
13898 /* Get argument value from a dictionary. Example: "%(name)s". */
13899 Py_ssize_t keystart;
13900 Py_ssize_t keylen;
13901 PyObject *key;
13902 int pcount = 1;
13903
13904 if (ctx->dict == NULL) {
13905 PyErr_SetString(PyExc_TypeError,
13906 "format requires a mapping");
13907 return -1;
13908 }
13909 ++ctx->fmtpos;
13910 --ctx->fmtcnt;
13911 keystart = ctx->fmtpos;
13912 /* Skip over balanced parentheses */
13913 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13914 arg->ch = FORMAT_READ(ctx);
13915 if (arg->ch == ')')
13916 --pcount;
13917 else if (arg->ch == '(')
13918 ++pcount;
13919 ctx->fmtpos++;
13920 }
13921 keylen = ctx->fmtpos - keystart - 1;
13922 if (ctx->fmtcnt < 0 || pcount > 0) {
13923 PyErr_SetString(PyExc_ValueError,
13924 "incomplete format key");
13925 return -1;
13926 }
13927 key = PyUnicode_Substring(ctx->fmtstr,
13928 keystart, keystart + keylen);
13929 if (key == NULL)
13930 return -1;
13931 if (ctx->args_owned) {
13932 Py_DECREF(ctx->args);
13933 ctx->args_owned = 0;
13934 }
13935 ctx->args = PyObject_GetItem(ctx->dict, key);
13936 Py_DECREF(key);
13937 if (ctx->args == NULL)
13938 return -1;
13939 ctx->args_owned = 1;
13940 ctx->arglen = -1;
13941 ctx->argidx = -2;
13942 }
13943
13944 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013945 while (--ctx->fmtcnt >= 0) {
13946 arg->ch = FORMAT_READ(ctx);
13947 ctx->fmtpos++;
13948 switch (arg->ch) {
13949 case '-': arg->flags |= F_LJUST; continue;
13950 case '+': arg->flags |= F_SIGN; continue;
13951 case ' ': arg->flags |= F_BLANK; continue;
13952 case '#': arg->flags |= F_ALT; continue;
13953 case '0': arg->flags |= F_ZERO; continue;
13954 }
13955 break;
13956 }
13957
13958 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013959 if (arg->ch == '*') {
13960 v = unicode_format_getnextarg(ctx);
13961 if (v == NULL)
13962 return -1;
13963 if (!PyLong_Check(v)) {
13964 PyErr_SetString(PyExc_TypeError,
13965 "* wants int");
13966 return -1;
13967 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013968 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013969 if (arg->width == -1 && PyErr_Occurred())
13970 return -1;
13971 if (arg->width < 0) {
13972 arg->flags |= F_LJUST;
13973 arg->width = -arg->width;
13974 }
13975 if (--ctx->fmtcnt >= 0) {
13976 arg->ch = FORMAT_READ(ctx);
13977 ctx->fmtpos++;
13978 }
13979 }
13980 else if (arg->ch >= '0' && arg->ch <= '9') {
13981 arg->width = arg->ch - '0';
13982 while (--ctx->fmtcnt >= 0) {
13983 arg->ch = FORMAT_READ(ctx);
13984 ctx->fmtpos++;
13985 if (arg->ch < '0' || arg->ch > '9')
13986 break;
13987 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13988 mixing signed and unsigned comparison. Since arg->ch is between
13989 '0' and '9', casting to int is safe. */
13990 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13991 PyErr_SetString(PyExc_ValueError,
13992 "width too big");
13993 return -1;
13994 }
13995 arg->width = arg->width*10 + (arg->ch - '0');
13996 }
13997 }
13998
13999 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014000 if (arg->ch == '.') {
14001 arg->prec = 0;
14002 if (--ctx->fmtcnt >= 0) {
14003 arg->ch = FORMAT_READ(ctx);
14004 ctx->fmtpos++;
14005 }
14006 if (arg->ch == '*') {
14007 v = unicode_format_getnextarg(ctx);
14008 if (v == NULL)
14009 return -1;
14010 if (!PyLong_Check(v)) {
14011 PyErr_SetString(PyExc_TypeError,
14012 "* wants int");
14013 return -1;
14014 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014015 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014016 if (arg->prec == -1 && PyErr_Occurred())
14017 return -1;
14018 if (arg->prec < 0)
14019 arg->prec = 0;
14020 if (--ctx->fmtcnt >= 0) {
14021 arg->ch = FORMAT_READ(ctx);
14022 ctx->fmtpos++;
14023 }
14024 }
14025 else if (arg->ch >= '0' && arg->ch <= '9') {
14026 arg->prec = arg->ch - '0';
14027 while (--ctx->fmtcnt >= 0) {
14028 arg->ch = FORMAT_READ(ctx);
14029 ctx->fmtpos++;
14030 if (arg->ch < '0' || arg->ch > '9')
14031 break;
14032 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14033 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014034 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014035 return -1;
14036 }
14037 arg->prec = arg->prec*10 + (arg->ch - '0');
14038 }
14039 }
14040 }
14041
14042 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14043 if (ctx->fmtcnt >= 0) {
14044 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14045 if (--ctx->fmtcnt >= 0) {
14046 arg->ch = FORMAT_READ(ctx);
14047 ctx->fmtpos++;
14048 }
14049 }
14050 }
14051 if (ctx->fmtcnt < 0) {
14052 PyErr_SetString(PyExc_ValueError,
14053 "incomplete format");
14054 return -1;
14055 }
14056 return 0;
14057
14058#undef FORMAT_READ
14059}
14060
14061/* Format one argument. Supported conversion specifiers:
14062
14063 - "s", "r", "a": any type
14064 - "i", "d", "u", "o", "x", "X": int
14065 - "e", "E", "f", "F", "g", "G": float
14066 - "c": int or str (1 character)
14067
Victor Stinner8dbd4212012-12-04 09:30:24 +010014068 When possible, the output is written directly into the Unicode writer
14069 (ctx->writer). A string is created when padding is required.
14070
Victor Stinnera47082312012-10-04 02:19:54 +020014071 Return 0 if the argument has been formatted into *p_str,
14072 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014073 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014074static int
14075unicode_format_arg_format(struct unicode_formatter_t *ctx,
14076 struct unicode_format_arg_t *arg,
14077 PyObject **p_str)
14078{
14079 PyObject *v;
14080 _PyUnicodeWriter *writer = &ctx->writer;
14081
14082 if (ctx->fmtcnt == 0)
14083 ctx->writer.overallocate = 0;
14084
14085 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014086 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014087 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014088 return 1;
14089 }
14090
14091 v = unicode_format_getnextarg(ctx);
14092 if (v == NULL)
14093 return -1;
14094
Victor Stinnera47082312012-10-04 02:19:54 +020014095
14096 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014097 case 's':
14098 case 'r':
14099 case 'a':
14100 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14101 /* Fast path */
14102 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14103 return -1;
14104 return 1;
14105 }
14106
14107 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14108 *p_str = v;
14109 Py_INCREF(*p_str);
14110 }
14111 else {
14112 if (arg->ch == 's')
14113 *p_str = PyObject_Str(v);
14114 else if (arg->ch == 'r')
14115 *p_str = PyObject_Repr(v);
14116 else
14117 *p_str = PyObject_ASCII(v);
14118 }
14119 break;
14120
14121 case 'i':
14122 case 'd':
14123 case 'u':
14124 case 'o':
14125 case 'x':
14126 case 'X':
14127 {
14128 int ret = mainformatlong(v, arg, p_str, writer);
14129 if (ret != 0)
14130 return ret;
14131 arg->sign = 1;
14132 break;
14133 }
14134
14135 case 'e':
14136 case 'E':
14137 case 'f':
14138 case 'F':
14139 case 'g':
14140 case 'G':
14141 if (arg->width == -1 && arg->prec == -1
14142 && !(arg->flags & (F_SIGN | F_BLANK)))
14143 {
14144 /* Fast path */
14145 if (formatfloat(v, arg, NULL, writer) == -1)
14146 return -1;
14147 return 1;
14148 }
14149
14150 arg->sign = 1;
14151 if (formatfloat(v, arg, p_str, NULL) == -1)
14152 return -1;
14153 break;
14154
14155 case 'c':
14156 {
14157 Py_UCS4 ch = formatchar(v);
14158 if (ch == (Py_UCS4) -1)
14159 return -1;
14160 if (arg->width == -1 && arg->prec == -1) {
14161 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014162 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014163 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014164 return 1;
14165 }
14166 *p_str = PyUnicode_FromOrdinal(ch);
14167 break;
14168 }
14169
14170 default:
14171 PyErr_Format(PyExc_ValueError,
14172 "unsupported format character '%c' (0x%x) "
14173 "at index %zd",
14174 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14175 (int)arg->ch,
14176 ctx->fmtpos - 1);
14177 return -1;
14178 }
14179 if (*p_str == NULL)
14180 return -1;
14181 assert (PyUnicode_Check(*p_str));
14182 return 0;
14183}
14184
14185static int
14186unicode_format_arg_output(struct unicode_formatter_t *ctx,
14187 struct unicode_format_arg_t *arg,
14188 PyObject *str)
14189{
14190 Py_ssize_t len;
14191 enum PyUnicode_Kind kind;
14192 void *pbuf;
14193 Py_ssize_t pindex;
14194 Py_UCS4 signchar;
14195 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014196 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014197 Py_ssize_t sublen;
14198 _PyUnicodeWriter *writer = &ctx->writer;
14199 Py_UCS4 fill;
14200
14201 fill = ' ';
14202 if (arg->sign && arg->flags & F_ZERO)
14203 fill = '0';
14204
14205 if (PyUnicode_READY(str) == -1)
14206 return -1;
14207
14208 len = PyUnicode_GET_LENGTH(str);
14209 if ((arg->width == -1 || arg->width <= len)
14210 && (arg->prec == -1 || arg->prec >= len)
14211 && !(arg->flags & (F_SIGN | F_BLANK)))
14212 {
14213 /* Fast path */
14214 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14215 return -1;
14216 return 0;
14217 }
14218
14219 /* Truncate the string for "s", "r" and "a" formats
14220 if the precision is set */
14221 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14222 if (arg->prec >= 0 && len > arg->prec)
14223 len = arg->prec;
14224 }
14225
14226 /* Adjust sign and width */
14227 kind = PyUnicode_KIND(str);
14228 pbuf = PyUnicode_DATA(str);
14229 pindex = 0;
14230 signchar = '\0';
14231 if (arg->sign) {
14232 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14233 if (ch == '-' || ch == '+') {
14234 signchar = ch;
14235 len--;
14236 pindex++;
14237 }
14238 else if (arg->flags & F_SIGN)
14239 signchar = '+';
14240 else if (arg->flags & F_BLANK)
14241 signchar = ' ';
14242 else
14243 arg->sign = 0;
14244 }
14245 if (arg->width < len)
14246 arg->width = len;
14247
14248 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014249 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014250 if (!(arg->flags & F_LJUST)) {
14251 if (arg->sign) {
14252 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014253 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014254 }
14255 else {
14256 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014257 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014258 }
14259 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014260 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14261 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014262 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014263 }
14264
Victor Stinnera47082312012-10-04 02:19:54 +020014265 buflen = arg->width;
14266 if (arg->sign && len == arg->width)
14267 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014268 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014269 return -1;
14270
14271 /* Write the sign if needed */
14272 if (arg->sign) {
14273 if (fill != ' ') {
14274 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14275 writer->pos += 1;
14276 }
14277 if (arg->width > len)
14278 arg->width--;
14279 }
14280
14281 /* Write the numeric prefix for "x", "X" and "o" formats
14282 if the alternate form is used.
14283 For example, write "0x" for the "%#x" format. */
14284 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14285 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14286 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14287 if (fill != ' ') {
14288 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14289 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14290 writer->pos += 2;
14291 pindex += 2;
14292 }
14293 arg->width -= 2;
14294 if (arg->width < 0)
14295 arg->width = 0;
14296 len -= 2;
14297 }
14298
14299 /* Pad left with the fill character if needed */
14300 if (arg->width > len && !(arg->flags & F_LJUST)) {
14301 sublen = arg->width - len;
14302 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14303 writer->pos += sublen;
14304 arg->width = len;
14305 }
14306
14307 /* If padding with spaces: write sign if needed and/or numeric prefix if
14308 the alternate form is used */
14309 if (fill == ' ') {
14310 if (arg->sign) {
14311 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14312 writer->pos += 1;
14313 }
14314 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14315 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14316 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14317 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14318 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14319 writer->pos += 2;
14320 pindex += 2;
14321 }
14322 }
14323
14324 /* Write characters */
14325 if (len) {
14326 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14327 str, pindex, len);
14328 writer->pos += len;
14329 }
14330
14331 /* Pad right with the fill character if needed */
14332 if (arg->width > len) {
14333 sublen = arg->width - len;
14334 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14335 writer->pos += sublen;
14336 }
14337 return 0;
14338}
14339
14340/* Helper of PyUnicode_Format(): format one arg.
14341 Return 0 on success, raise an exception and return -1 on error. */
14342static int
14343unicode_format_arg(struct unicode_formatter_t *ctx)
14344{
14345 struct unicode_format_arg_t arg;
14346 PyObject *str;
14347 int ret;
14348
Victor Stinner8dbd4212012-12-04 09:30:24 +010014349 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14350 arg.flags = 0;
14351 arg.width = -1;
14352 arg.prec = -1;
14353 arg.sign = 0;
14354 str = NULL;
14355
Victor Stinnera47082312012-10-04 02:19:54 +020014356 ret = unicode_format_arg_parse(ctx, &arg);
14357 if (ret == -1)
14358 return -1;
14359
14360 ret = unicode_format_arg_format(ctx, &arg, &str);
14361 if (ret == -1)
14362 return -1;
14363
14364 if (ret != 1) {
14365 ret = unicode_format_arg_output(ctx, &arg, str);
14366 Py_DECREF(str);
14367 if (ret == -1)
14368 return -1;
14369 }
14370
14371 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14372 PyErr_SetString(PyExc_TypeError,
14373 "not all arguments converted during string formatting");
14374 return -1;
14375 }
14376 return 0;
14377}
14378
Alexander Belopolsky40018472011-02-26 01:02:56 +000014379PyObject *
14380PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014381{
Victor Stinnera47082312012-10-04 02:19:54 +020014382 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014383
Guido van Rossumd57fd912000-03-10 22:53:23 +000014384 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014385 PyErr_BadInternalCall();
14386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014387 }
Victor Stinnera47082312012-10-04 02:19:54 +020014388
14389 ctx.fmtstr = PyUnicode_FromObject(format);
14390 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014391 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014392 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14393 Py_DECREF(ctx.fmtstr);
14394 return NULL;
14395 }
14396 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14397 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14398 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14399 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014400
Victor Stinner8f674cc2013-04-17 23:02:17 +020014401 _PyUnicodeWriter_Init(&ctx.writer);
14402 ctx.writer.min_length = ctx.fmtcnt + 100;
14403 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014404
Guido van Rossumd57fd912000-03-10 22:53:23 +000014405 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014406 ctx.arglen = PyTuple_Size(args);
14407 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014408 }
14409 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014410 ctx.arglen = -1;
14411 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014412 }
Victor Stinnera47082312012-10-04 02:19:54 +020014413 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014414 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014415 ctx.dict = args;
14416 else
14417 ctx.dict = NULL;
14418 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014419
Victor Stinnera47082312012-10-04 02:19:54 +020014420 while (--ctx.fmtcnt >= 0) {
14421 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014422 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014423
14424 nonfmtpos = ctx.fmtpos++;
14425 while (ctx.fmtcnt >= 0 &&
14426 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14427 ctx.fmtpos++;
14428 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014429 }
Victor Stinnera47082312012-10-04 02:19:54 +020014430 if (ctx.fmtcnt < 0) {
14431 ctx.fmtpos--;
14432 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014433 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014434
Victor Stinnercfc4c132013-04-03 01:48:39 +020014435 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14436 nonfmtpos, ctx.fmtpos) < 0)
14437 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014438 }
14439 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014440 ctx.fmtpos++;
14441 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014442 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014443 }
14444 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014445
Victor Stinnera47082312012-10-04 02:19:54 +020014446 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014447 PyErr_SetString(PyExc_TypeError,
14448 "not all arguments converted during string formatting");
14449 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014450 }
14451
Victor Stinnera47082312012-10-04 02:19:54 +020014452 if (ctx.args_owned) {
14453 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014454 }
Victor Stinnera47082312012-10-04 02:19:54 +020014455 Py_DECREF(ctx.fmtstr);
14456 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014457
Benjamin Peterson29060642009-01-31 22:14:21 +000014458 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014459 Py_DECREF(ctx.fmtstr);
14460 _PyUnicodeWriter_Dealloc(&ctx.writer);
14461 if (ctx.args_owned) {
14462 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014463 }
14464 return NULL;
14465}
14466
Jeremy Hylton938ace62002-07-17 16:30:39 +000014467static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014468unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14469
Tim Peters6d6c1a32001-08-02 04:15:00 +000014470static PyObject *
14471unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14472{
Benjamin Peterson29060642009-01-31 22:14:21 +000014473 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014474 static char *kwlist[] = {"object", "encoding", "errors", 0};
14475 char *encoding = NULL;
14476 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014477
Benjamin Peterson14339b62009-01-31 16:36:08 +000014478 if (type != &PyUnicode_Type)
14479 return unicode_subtype_new(type, args, kwds);
14480 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014481 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014482 return NULL;
14483 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014484 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014485 if (encoding == NULL && errors == NULL)
14486 return PyObject_Str(x);
14487 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014488 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014489}
14490
Guido van Rossume023fe02001-08-30 03:12:59 +000014491static PyObject *
14492unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14493{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014494 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014495 Py_ssize_t length, char_size;
14496 int share_wstr, share_utf8;
14497 unsigned int kind;
14498 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014499
Benjamin Peterson14339b62009-01-31 16:36:08 +000014500 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014501
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014502 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014503 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014504 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014505 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014506 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014507 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014508 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014509 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014510
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014511 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014512 if (self == NULL) {
14513 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 return NULL;
14515 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014516 kind = PyUnicode_KIND(unicode);
14517 length = PyUnicode_GET_LENGTH(unicode);
14518
14519 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014520#ifdef Py_DEBUG
14521 _PyUnicode_HASH(self) = -1;
14522#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014523 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014524#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014525 _PyUnicode_STATE(self).interned = 0;
14526 _PyUnicode_STATE(self).kind = kind;
14527 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014528 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014529 _PyUnicode_STATE(self).ready = 1;
14530 _PyUnicode_WSTR(self) = NULL;
14531 _PyUnicode_UTF8_LENGTH(self) = 0;
14532 _PyUnicode_UTF8(self) = NULL;
14533 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014534 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014535
14536 share_utf8 = 0;
14537 share_wstr = 0;
14538 if (kind == PyUnicode_1BYTE_KIND) {
14539 char_size = 1;
14540 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14541 share_utf8 = 1;
14542 }
14543 else if (kind == PyUnicode_2BYTE_KIND) {
14544 char_size = 2;
14545 if (sizeof(wchar_t) == 2)
14546 share_wstr = 1;
14547 }
14548 else {
14549 assert(kind == PyUnicode_4BYTE_KIND);
14550 char_size = 4;
14551 if (sizeof(wchar_t) == 4)
14552 share_wstr = 1;
14553 }
14554
14555 /* Ensure we won't overflow the length. */
14556 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14557 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014558 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014559 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014560 data = PyObject_MALLOC((length + 1) * char_size);
14561 if (data == NULL) {
14562 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014563 goto onError;
14564 }
14565
Victor Stinnerc3c74152011-10-02 20:39:55 +020014566 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014567 if (share_utf8) {
14568 _PyUnicode_UTF8_LENGTH(self) = length;
14569 _PyUnicode_UTF8(self) = data;
14570 }
14571 if (share_wstr) {
14572 _PyUnicode_WSTR_LENGTH(self) = length;
14573 _PyUnicode_WSTR(self) = (wchar_t *)data;
14574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014575
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014576 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014577 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014578 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014579#ifdef Py_DEBUG
14580 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14581#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014582 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014583 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014584
14585onError:
14586 Py_DECREF(unicode);
14587 Py_DECREF(self);
14588 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014589}
14590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014591PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014592"str(object='') -> str\n\
14593str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014594\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014595Create a new string object from the given object. If encoding or\n\
14596errors is specified, then the object must expose a data buffer\n\
14597that will be decoded using the given encoding and error handler.\n\
14598Otherwise, returns the result of object.__str__() (if defined)\n\
14599or repr(object).\n\
14600encoding defaults to sys.getdefaultencoding().\n\
14601errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014602
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014603static PyObject *unicode_iter(PyObject *seq);
14604
/* Type object for "str".  Each initializer is labelled with the
   PyTypeObject slot it fills; a 0 entry supplies no value for that slot
   in this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14648
14649/* Initialize the Unicode implementation */
14650
Victor Stinner3a50e702011-10-18 21:21:00 +020014651int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014652{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014653 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014654 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014655 0x000A, /* LINE FEED */
14656 0x000D, /* CARRIAGE RETURN */
14657 0x001C, /* FILE SEPARATOR */
14658 0x001D, /* GROUP SEPARATOR */
14659 0x001E, /* RECORD SEPARATOR */
14660 0x0085, /* NEXT LINE */
14661 0x2028, /* LINE SEPARATOR */
14662 0x2029, /* PARAGRAPH SEPARATOR */
14663 };
14664
Fred Drakee4315f52000-05-09 19:53:39 +000014665 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014666 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014667 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014668 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014669 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014670
Guido van Rossumcacfc072002-05-24 19:01:59 +000014671 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014672 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014673
14674 /* initialize the linebreak bloom filter */
14675 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014676 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014677 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014678
Christian Heimes26532f72013-07-20 14:57:16 +020014679 if (PyType_Ready(&EncodingMapType) < 0)
14680 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014681
Benjamin Petersonc4311282012-10-30 23:21:10 -040014682 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14683 Py_FatalError("Can't initialize field name iterator type");
14684
14685 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14686 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014687
Victor Stinner3a50e702011-10-18 21:21:00 +020014688#ifdef HAVE_MBCS
14689 winver.dwOSVersionInfoSize = sizeof(winver);
14690 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14691 PyErr_SetFromWindowsErr(0);
14692 return -1;
14693 }
14694#endif
14695 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014696}
14697
14698/* Finalize the Unicode implementation */
14699
/* Nothing is released here: the function unconditionally reports that
   zero entries were freed. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14705
Guido van Rossumd57fd912000-03-10 22:53:23 +000014706void
Thomas Wouters78890102000-07-22 19:25:51 +000014707_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014708{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014709 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014710
Serhiy Storchaka05997252013-01-26 12:14:02 +020014711 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014712
Serhiy Storchaka05997252013-01-26 12:14:02 +020014713 for (i = 0; i < 256; i++)
14714 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014715 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014716 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014717}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014718
Walter Dörwald16807132007-05-25 13:52:07 +000014719void
14720PyUnicode_InternInPlace(PyObject **p)
14721{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014722 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014723 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014724#ifdef Py_DEBUG
14725 assert(s != NULL);
14726 assert(_PyUnicode_CHECK(s));
14727#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014728 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014729 return;
14730#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014731 /* If it's a subclass, we don't really know what putting
14732 it in the interned dict might do. */
14733 if (!PyUnicode_CheckExact(s))
14734 return;
14735 if (PyUnicode_CHECK_INTERNED(s))
14736 return;
14737 if (interned == NULL) {
14738 interned = PyDict_New();
14739 if (interned == NULL) {
14740 PyErr_Clear(); /* Don't leave an exception */
14741 return;
14742 }
14743 }
14744 /* It might be that the GetItem call fails even
14745 though the key is present in the dictionary,
14746 namely when this happens during a stack overflow. */
14747 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014748 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014749 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014750
Victor Stinnerf0335102013-04-14 19:13:03 +020014751 if (t) {
14752 Py_INCREF(t);
14753 Py_DECREF(*p);
14754 *p = t;
14755 return;
14756 }
Walter Dörwald16807132007-05-25 13:52:07 +000014757
Benjamin Peterson14339b62009-01-31 16:36:08 +000014758 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014759 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014760 PyErr_Clear();
14761 PyThreadState_GET()->recursion_critical = 0;
14762 return;
14763 }
14764 PyThreadState_GET()->recursion_critical = 0;
14765 /* The two references in interned are not counted by refcnt.
14766 The deallocator will take care of this */
14767 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014768 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014769}
14770
14771void
14772PyUnicode_InternImmortal(PyObject **p)
14773{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014774 PyUnicode_InternInPlace(p);
14775 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014776 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014777 Py_INCREF(*p);
14778 }
Walter Dörwald16807132007-05-25 13:52:07 +000014779}
14780
14781PyObject *
14782PyUnicode_InternFromString(const char *cp)
14783{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014784 PyObject *s = PyUnicode_FromString(cp);
14785 if (s == NULL)
14786 return NULL;
14787 PyUnicode_InternInPlace(&s);
14788 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014789}
14790
Alexander Belopolsky40018472011-02-26 01:02:56 +000014791void
14792_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014793{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014794 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014795 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014796 Py_ssize_t i, n;
14797 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014798
Benjamin Peterson14339b62009-01-31 16:36:08 +000014799 if (interned == NULL || !PyDict_Check(interned))
14800 return;
14801 keys = PyDict_Keys(interned);
14802 if (keys == NULL || !PyList_Check(keys)) {
14803 PyErr_Clear();
14804 return;
14805 }
Walter Dörwald16807132007-05-25 13:52:07 +000014806
Benjamin Peterson14339b62009-01-31 16:36:08 +000014807 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14808 detector, interned unicode strings are not forcibly deallocated;
14809 rather, we give them their stolen references back, and then clear
14810 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014811
Benjamin Peterson14339b62009-01-31 16:36:08 +000014812 n = PyList_GET_SIZE(keys);
14813 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014814 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014815 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014816 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014817 if (PyUnicode_READY(s) == -1) {
14818 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014819 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014821 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014822 case SSTATE_NOT_INTERNED:
14823 /* XXX Shouldn't happen */
14824 break;
14825 case SSTATE_INTERNED_IMMORTAL:
14826 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014827 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014828 break;
14829 case SSTATE_INTERNED_MORTAL:
14830 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014831 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014832 break;
14833 default:
14834 Py_FatalError("Inconsistent interned string state.");
14835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014836 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014837 }
14838 fprintf(stderr, "total size of all interned strings: "
14839 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14840 "mortal/immortal\n", mortal_size, immortal_size);
14841 Py_DECREF(keys);
14842 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014843 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014844}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014845
14846
14847/********************* Unicode Iterator **************************/
14848
/* State of a str_iterator object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14854
14855static void
14856unicodeiter_dealloc(unicodeiterobject *it)
14857{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014858 _PyObject_GC_UNTRACK(it);
14859 Py_XDECREF(it->it_seq);
14860 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014861}
14862
14863static int
14864unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14865{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014866 Py_VISIT(it->it_seq);
14867 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014868}
14869
14870static PyObject *
14871unicodeiter_next(unicodeiterobject *it)
14872{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014873 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014874
Benjamin Peterson14339b62009-01-31 16:36:08 +000014875 assert(it != NULL);
14876 seq = it->it_seq;
14877 if (seq == NULL)
14878 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014879 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014881 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14882 int kind = PyUnicode_KIND(seq);
14883 void *data = PyUnicode_DATA(seq);
14884 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14885 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014886 if (item != NULL)
14887 ++it->it_index;
14888 return item;
14889 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014890
Benjamin Peterson14339b62009-01-31 16:36:08 +000014891 Py_DECREF(seq);
14892 it->it_seq = NULL;
14893 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014894}
14895
14896static PyObject *
14897unicodeiter_len(unicodeiterobject *it)
14898{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014899 Py_ssize_t len = 0;
14900 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014901 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014902 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014903}
14904
14905PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14906
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014907static PyObject *
14908unicodeiter_reduce(unicodeiterobject *it)
14909{
14910 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014911 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014912 it->it_seq, it->it_index);
14913 } else {
14914 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14915 if (u == NULL)
14916 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014917 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014918 }
14919}
14920
14921PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14922
14923static PyObject *
14924unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14925{
14926 Py_ssize_t index = PyLong_AsSsize_t(state);
14927 if (index == -1 && PyErr_Occurred())
14928 return NULL;
14929 if (index < 0)
14930 index = 0;
14931 it->it_index = index;
14932 Py_RETURN_NONE;
14933}
14934
14935PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14936
/* Methods of str_iterator objects (installed through tp_methods of
   PyUnicodeIter_Type). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
14946
/* Type object for "str_iterator", the iterator created by unicode_iter().
   Initializers after the last one listed are omitted. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
14979
14980static PyObject *
14981unicode_iter(PyObject *seq)
14982{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014983 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014984
Benjamin Peterson14339b62009-01-31 16:36:08 +000014985 if (!PyUnicode_Check(seq)) {
14986 PyErr_BadInternalCall();
14987 return NULL;
14988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014989 if (PyUnicode_READY(seq) == -1)
14990 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014991 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14992 if (it == NULL)
14993 return NULL;
14994 it->it_index = 0;
14995 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014996 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014997 _PyObject_GC_TRACK(it);
14998 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014999}
15000
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015001
/* Return the number of Py_UNICODE elements before the terminating null
   element of u.  The counter has the same type as the return value so
   it cannot overflow on buffers longer than INT_MAX elements. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
15010
/* Copy s2, including its terminating null element, into s1.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (!ch)
            break;
    }
    return s1;
}
15018
/* Copy at most n elements from s2 to s1, stopping after the terminating
   null element of s2 has been copied.  Returns s1.

   Nothing is written when n is 0, and no more than n elements are ever
   stored, so a destination of n elements cannot be overrun.  As with
   the C library strncpy(), the result is not null-terminated when s2
   holds n or more non-null elements; unlike strncpy(), the remainder of
   s1 is not padded. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
15028
/* Append s2 (with its terminating null element) to the end of s1.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* The copy starts at the current terminator of s1. */
    Py_UNICODE_strcpy(s1 + Py_UNICODE_strlen(s1), s2);
    return s1;
}
15037
/* Compare two null-terminated Py_UNICODE strings.
   Returns 0 if they are equal, -1 if s1 sorts before s2 and +1
   otherwise.  A string that is a proper prefix of the other sorts
   first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE c1, c2;

    /* Advance over the common prefix. */
    do {
        c1 = *s1++;
        c2 = *s2++;
    } while (c1 != 0 && c1 == c2);

    if (c1 == c2)
        return 0;       /* both strings ended together */
    if (c1 == 0)
        return -1;      /* s1 ended first */
    if (c2 == 0)
        return 1;       /* s2 ended first */
    return (c1 < c2) ? -1 : +1;
}
15051
/* Compare at most n elements of two null-terminated Py_UNICODE strings.
   Returns -1 or +1 at the first differing element, and 0 if no
   difference is found within n elements or before the terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings end here */
    }
    return 0;
}
15068
/* Return a pointer to the first element of s equal to c, or NULL if
   there is none.  The terminating null element is never matched, so
   searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cur = s;

    while (*cur) {
        if (*cur == c)
            return (Py_UNICODE*)cur;
        cur++;
    }
    return NULL;
}
15078
/* Return a pointer to the last element of s equal to c, or NULL if
   there is none.  The terminating null element is never matched, so
   searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass that remembers the most recent match. */
    for (; *s; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015091
Victor Stinner71133ff2010-09-01 23:43:53 +000015092Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015093PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015094{
Victor Stinner577db2c2011-10-11 22:12:48 +020015095 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015096 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015098 if (!PyUnicode_Check(unicode)) {
15099 PyErr_BadArgument();
15100 return NULL;
15101 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015102 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015103 if (u == NULL)
15104 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015105 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015106 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015107 PyErr_NoMemory();
15108 return NULL;
15109 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015110 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015111 size *= sizeof(Py_UNICODE);
15112 copy = PyMem_Malloc(size);
15113 if (copy == NULL) {
15114 PyErr_NoMemory();
15115 return NULL;
15116 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015117 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015118 return copy;
15119}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015120
Georg Brandl66c221e2010-10-14 07:04:07 +000015121/* A _string module, to export formatter_parser and formatter_field_name_split
15122 to the string.Formatter class implemented in Python. */
15123
/* Functions exported by the _string module; each takes one argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15131
/* Module definition passed to PyModule_Create() by PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,
    _string_methods,                    /* module-level functions */
    NULL,                               /* remaining entries are unused */
    NULL,
    NULL,
    NULL
};
15143
15144PyMODINIT_FUNC
15145PyInit__string(void)
15146{
15147 return PyModule_Create(&_string_module);
15148}
15149
15150
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015151#ifdef __cplusplus
15152}
15153#endif