blob: ff806cf1a115db4fab00c6525c726bae11137484 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
Serhiy Storchaka05997252013-01-26 12:14:02 +020052NOTE: In the interpreter's initialization phase, some globals are currently
53 initialized dynamically as needed. In the process Unicode objects may
54 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055
56*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
Victor Stinner8faf8212011-12-08 22:14:11 +010063/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
64#define MAX_UNICODE 0x10ffff
65
/* Object check used by the internal accessor macros below.  Debug builds
   run _PyUnicode_CheckConsistency() (without the content scan); other
   builds only run PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* Unchecked access to the utf8 pointer stored in the
   PyCompactUnicodeObject part of the object. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked access to the UTF-8 data: a compact ASCII string uses the bytes
   that directly follow its PyASCIIObject header, every other string uses
   the stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         _PyUnicode_UTF8(op) :                          \
         ((char*)((PyASCIIObject*)(op) + 1)))

/* Unchecked access to the stored utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked access to the UTF-8 length: a compact ASCII string reports its
   character length, every other string the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         _PyUnicode_UTF8_LENGTH(op) :                   \
         ((PyASCIIObject*)(op))->length)
/* Unchecked accessors for the individual object fields.  Each one expands
   to a plain member expression, so it can also be assigned to. */
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

/* Read-only variants that first run _PyUnicode_CHECK() on the object
   (comma expressions, hence not assignable). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* File-local replacement for PyUnicode_READY: checks the object with
   _PyUnicode_CHECK(), then evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready(op) :                            \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200113
/* True if the utf8 pointer of a string that is not compact ASCII aliases
   its character data. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer aliases the character data.  The macro uses
   only its own argument; it previously expanded a hard-coded `unicode`
   identifier and so depended on the caller's variable name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
121
/* True if the Unicode object owns a separate UTF-8 memory block: it is not
   compact ASCII, its utf8 pointer is set, and that pointer does not alias
   the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
128
/* True if the Unicode object owns a separate wstr memory block: its wstr
   pointer is set and either the string is not ready yet or the pointer
   does not alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) &&                             \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
135
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once.  `to` is parenthesized before
   the cast so that an expression such as `base + offset` is converted as
   a whole instead of having the cast bind to `base` only.  The main loop
   is unrolled four times; the tail loop copies the remaining 0-3 items. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200159
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200168static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000169
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000170/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200171static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200172
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that allocation fails,
   unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else                                            \
            Py_INCREF(unicode_empty);                   \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is NULL if the string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200192/* Forward declaration */
193Py_LOCAL_INLINE(int)
194_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
195
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200196/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200197static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* Single character Unicode strings in the Latin-1 range are being
200 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202
/* Fast detection of the most frequent whitespace characters.
   The table is indexed by code point (0x00..0x7F); a 1 marks whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
233
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200234/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200235static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200236static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100237static int unicode_modifiable(PyObject *unicode);
238
Victor Stinnerfe226c02011-10-03 03:52:20 +0200239
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100241_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200242static PyObject *
243_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
244static PyObject *
245_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
246
247static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000248unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000249 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100250 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000251 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
252
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253static void
254raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300255 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100256 PyObject *unicode,
257 Py_ssize_t startpos, Py_ssize_t endpos,
258 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000259
/* Same for linebreaks: indexed by code point (0x00..0x7F), a 1 marks a
   line break character.  The table is only ever read, so it is const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
287
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300288/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
289 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000291PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000293#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000294 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000296 /* This is actually an illegal character, so it should
297 not be passed to unichr. */
298 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299#endif
300}
301
#ifdef Py_DEBUG
/* Debug-only validation of the internal layout of a Unicode object.

   The state flags, kind, data/utf8/wstr pointers and cached lengths are
   checked against each other with assert().  If check_content is non-zero
   and the string is not in the wchar_t (not ready) form, the characters
   are scanned as well to verify that the narrowest possible kind is used
   and that the data is followed by a null character.

   Always returns 1, so the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;
    Py_ssize_t pos;
    Py_UCS4 maxchar, ch;
    void *chars;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character width matches wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* not compact, not ready: only the wstr buffer exists */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* not compact, ready */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else
                assert(compact->utf8 != data);
        }

        /* wstr aliases the data exactly when the kind matches wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    if (!check_content || kind == PyUnicode_WCHAR_KIND)
        return 1;

    /* check that the best kind is used */
    chars = PyUnicode_DATA(ascii);
    maxchar = 0;
    for (pos = 0; pos < ascii->length; pos++) {
        ch = PyUnicode_READ(kind, chars, pos);
        if (maxchar < ch)
            maxchar = ch;
    }
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (ascii->state.ascii == 0) {
            assert(maxchar >= 128);
            assert(maxchar <= 255);
        }
        else
            assert(maxchar < 128);
        break;
    case PyUnicode_2BYTE_KIND:
        assert(maxchar >= 0x100);
        assert(maxchar <= 0xFFFF);
        break;
    default:
        assert(maxchar >= 0x10000);
        assert(maxchar <= MAX_UNICODE);
        break;
    }
    /* the data must be followed by a null character */
    assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    return 1;
}
#endif
417
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100418static PyObject*
419unicode_result_wchar(PyObject *unicode)
420{
421#ifndef Py_DEBUG
422 Py_ssize_t len;
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100426 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200427 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100432 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200440 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 return NULL;
442 }
443#else
Victor Stinneraa771272012-10-04 02:32:58 +0200444 assert(Py_REFCNT(unicode) == 1);
445
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 /* don't make the result ready in debug mode to ensure that the caller
447 makes the string ready before using it */
448 assert(_PyUnicode_CheckConsistency(unicode, 1));
449#endif
450 return unicode;
451}
452
453static PyObject*
454unicode_result_ready(PyObject *unicode)
455{
456 Py_ssize_t length;
457
458 length = PyUnicode_GET_LENGTH(unicode);
459 if (length == 0) {
460 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100461 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200462 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100463 }
464 return unicode_empty;
465 }
466
467 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200468 void *data = PyUnicode_DATA(unicode);
469 int kind = PyUnicode_KIND(unicode);
470 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100471 if (ch < 256) {
472 PyObject *latin1_char = unicode_latin1[ch];
473 if (latin1_char != NULL) {
474 if (unicode != latin1_char) {
475 Py_INCREF(latin1_char);
476 Py_DECREF(unicode);
477 }
478 return latin1_char;
479 }
480 else {
481 assert(_PyUnicode_CheckConsistency(unicode, 1));
482 Py_INCREF(unicode);
483 unicode_latin1[ch] = unicode;
484 return unicode;
485 }
486 }
487 }
488
489 assert(_PyUnicode_CheckConsistency(unicode, 1));
490 return unicode;
491}
492
493static PyObject*
494unicode_result(PyObject *unicode)
495{
496 assert(_PyUnicode_CHECK(unicode));
497 if (PyUnicode_IS_READY(unicode))
498 return unicode_result_ready(unicode);
499 else
500 return unicode_result_wchar(unicode);
501}
502
Victor Stinnerc4b49542011-12-11 22:44:26 +0100503static PyObject*
504unicode_result_unchanged(PyObject *unicode)
505{
506 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500507 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508 return NULL;
509 Py_INCREF(unicode);
510 return unicode;
511 }
512 else
513 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100514 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515}
516
Victor Stinner3a50e702011-10-18 21:21:00 +0200517#ifdef HAVE_MBCS
518static OSVERSIONINFOEX winver;
519#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521/* --- Bloom Filters ----------------------------------------------------- */
522
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1), i.e. the least
   5 bits for a 32-bit mask) as the bit index. */
526
527/* the linebreak mask is set up by Unicode_Init below */
528
Antoine Pitrouf068f942010-01-13 14:19:12 +0000529#if LONG_BIT >= 128
530#define BLOOM_WIDTH 128
531#elif LONG_BIT >= 64
532#define BLOOM_WIDTH 64
533#elif LONG_BIT >= 32
534#define BLOOM_WIDTH 32
535#else
536#error "LONG_BIT is smaller than 32"
537#endif
538
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539#define BLOOM_MASK unsigned long
540
Serhiy Storchaka05997252013-01-26 12:14:02 +0200541static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542
/* Non-zero if the bit selected by the low-order bits of ch is set in mask.
   Both arguments are parenthesized so that compound expressions such as
   `a | b` can be passed safely. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in ascii_linebreak[];
   other characters are first filtered through the bloom_linebreak mask and
   then confirmed with Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
Alexander Belopolsky40018472011-02-26 01:02:56 +0000549Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551{
Victor Stinnera85af502013-04-09 21:53:54 +0200552#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
553 do { \
554 TYPE *data = (TYPE *)PTR; \
555 TYPE *end = data + LEN; \
556 Py_UCS4 ch; \
557 for (; data != end; data++) { \
558 ch = *data; \
559 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
560 } \
561 break; \
562 } while (0)
563
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564 /* calculate simple bloom-style bitmask for a given unicode string */
565
Antoine Pitrouf068f942010-01-13 14:19:12 +0000566 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567
568 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200569 switch (kind) {
570 case PyUnicode_1BYTE_KIND:
571 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
572 break;
573 case PyUnicode_2BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
575 break;
576 case PyUnicode_4BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
578 break;
579 default:
580 assert(0);
581 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000582 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200583
584#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585}
586
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200587/* Compilation of templated routines */
588
589#include "stringlib/asciilib.h"
590#include "stringlib/fastsearch.h"
591#include "stringlib/partition.h"
592#include "stringlib/split.h"
593#include "stringlib/count.h"
594#include "stringlib/find.h"
595#include "stringlib/find_max_char.h"
596#include "stringlib/localeutil.h"
597#include "stringlib/undef.h"
598
599#include "stringlib/ucs1lib.h"
600#include "stringlib/fastsearch.h"
601#include "stringlib/partition.h"
602#include "stringlib/split.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300605#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs2lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300616#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200617#include "stringlib/find_max_char.h"
618#include "stringlib/localeutil.h"
619#include "stringlib/undef.h"
620
621#include "stringlib/ucs4lib.h"
622#include "stringlib/fastsearch.h"
623#include "stringlib/partition.h"
624#include "stringlib/split.h"
625#include "stringlib/count.h"
626#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300627#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200628#include "stringlib/find_max_char.h"
629#include "stringlib/localeutil.h"
630#include "stringlib/undef.h"
631
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200632#include "stringlib/unicodedefs.h"
633#include "stringlib/fastsearch.h"
634#include "stringlib/count.h"
635#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100636#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638/* --- Unicode Object ----------------------------------------------------- */
639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200641fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200643Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
644 Py_ssize_t size, Py_UCS4 ch,
645 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
648
649 switch (kind) {
650 case PyUnicode_1BYTE_KIND:
651 {
652 Py_UCS1 ch1 = (Py_UCS1) ch;
653 if (ch1 == ch)
654 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
655 else
656 return -1;
657 }
658 case PyUnicode_2BYTE_KIND:
659 {
660 Py_UCS2 ch2 = (Py_UCS2) ch;
661 if (ch2 == ch)
662 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
663 else
664 return -1;
665 }
666 case PyUnicode_4BYTE_KIND:
667 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
668 default:
669 assert(0);
670 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672}
673
#ifdef Py_DEBUG
/* Fill the data of an Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten, every byte being set to 0xff; nothing is done if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);   /* bytes per character */
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length < new_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
692
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693static PyObject*
694resize_compact(PyObject *unicode, Py_ssize_t length)
695{
696 Py_ssize_t char_size;
697 Py_ssize_t struct_size;
698 Py_ssize_t new_size;
699 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100700 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
703#endif
704
Victor Stinner79891572012-05-03 13:43:07 +0200705 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100707 assert(PyUnicode_IS_COMPACT(unicode));
708
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200709 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100710 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 struct_size = sizeof(PyASCIIObject);
712 else
713 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
717 PyErr_NoMemory();
718 return NULL;
719 }
720 new_size = (struct_size + (length + 1) * char_size);
721
Victor Stinner84def372011-12-11 20:04:56 +0100722 _Py_DEC_REFTOTAL;
723 _Py_ForgetReference(unicode);
724
725 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
726 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100727 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200728 PyErr_NoMemory();
729 return NULL;
730 }
Victor Stinner84def372011-12-11 20:04:56 +0100731 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100733
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100737 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 _PyUnicode_WSTR_LENGTH(unicode) = length;
739 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100740 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
741 PyObject_DEL(_PyUnicode_WSTR(unicode));
742 _PyUnicode_WSTR(unicode) = NULL;
743 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200744#ifdef Py_DEBUG
745 unicode_fill_invalid(unicode, old_length);
746#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
748 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 return unicode;
751}
752
Alexander Belopolsky40018472011-02-26 01:02:56 +0000753static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200754resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755{
Victor Stinner95663112011-10-04 01:03:50 +0200756 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000760
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 if (PyUnicode_IS_READY(unicode)) {
762 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200765#ifdef Py_DEBUG
766 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
767#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200770 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200771 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
772 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
775 PyErr_NoMemory();
776 return -1;
777 }
778 new_size = (length + 1) * char_size;
779
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
781 {
782 PyObject_DEL(_PyUnicode_UTF8(unicode));
783 _PyUnicode_UTF8(unicode) = NULL;
784 _PyUnicode_UTF8_LENGTH(unicode) = 0;
785 }
786
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 data = (PyObject *)PyObject_REALLOC(data, new_size);
788 if (data == NULL) {
789 PyErr_NoMemory();
790 return -1;
791 }
792 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200793 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200795 _PyUnicode_WSTR_LENGTH(unicode) = length;
796 }
797 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200798 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200799 _PyUnicode_UTF8_LENGTH(unicode) = length;
800 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200801 _PyUnicode_LENGTH(unicode) = length;
802 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200803#ifdef Py_DEBUG
804 unicode_fill_invalid(unicode, old_length);
805#endif
Victor Stinner95663112011-10-04 01:03:50 +0200806 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200807 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200809 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200810 }
Victor Stinner95663112011-10-04 01:03:50 +0200811 assert(_PyUnicode_WSTR(unicode) != NULL);
812
813 /* check for integer overflow */
814 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
815 PyErr_NoMemory();
816 return -1;
817 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100818 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200819 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100820 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200821 if (!wstr) {
822 PyErr_NoMemory();
823 return -1;
824 }
825 _PyUnicode_WSTR(unicode) = wstr;
826 _PyUnicode_WSTR(unicode)[length] = 0;
827 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200828 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 return 0;
830}
831
Victor Stinnerfe226c02011-10-03 03:52:20 +0200832static PyObject*
833resize_copy(PyObject *unicode, Py_ssize_t length)
834{
835 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100836 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100838
Benjamin Petersonbac79492012-01-14 13:34:47 -0500839 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100840 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200841
842 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
843 if (copy == NULL)
844 return NULL;
845
846 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200847 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200848 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200849 }
850 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200851 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100852
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200853 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200854 if (w == NULL)
855 return NULL;
856 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
857 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200858 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
859 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200860 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200861 }
862}
863
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000865 Ux0000 terminated; some code (e.g. new_identifier)
866 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867
868 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000869 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870
871*/
872
Alexander Belopolsky40018472011-02-26 01:02:56 +0000873static PyUnicodeObject *
874_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200876 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880 if (length == 0 && unicode_empty != NULL) {
881 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200882 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 }
884
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000885 /* Ensure we won't overflow the size. */
886 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
887 return (PyUnicodeObject *)PyErr_NoMemory();
888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 if (length < 0) {
890 PyErr_SetString(PyExc_SystemError,
891 "Negative size passed to _PyUnicode_New");
892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
896 if (unicode == NULL)
897 return NULL;
898 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
899 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
900 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100901 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000902 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100903 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905
Jeremy Hyltond8082792003-09-16 19:41:39 +0000906 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000907 * the caller fails before initializing str -- unicode_resize()
908 * reads str[0], and the Keep-Alive optimization can keep memory
909 * allocated for str alive across a call to unicode_dealloc(unicode).
910 * We don't want unicode_resize to read uninitialized memory in
911 * that case.
912 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913 _PyUnicode_WSTR(unicode)[0] = 0;
914 _PyUnicode_WSTR(unicode)[length] = 0;
915 _PyUnicode_WSTR_LENGTH(unicode) = length;
916 _PyUnicode_HASH(unicode) = -1;
917 _PyUnicode_STATE(unicode).interned = 0;
918 _PyUnicode_STATE(unicode).kind = 0;
919 _PyUnicode_STATE(unicode).compact = 0;
920 _PyUnicode_STATE(unicode).ready = 0;
921 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200922 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200924 _PyUnicode_UTF8(unicode) = NULL;
925 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100926 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000927 return unicode;
928}
929
Victor Stinnerf42dc442011-10-02 23:33:16 +0200930static const char*
931unicode_kind_name(PyObject *unicode)
932{
Victor Stinner42dfd712011-10-03 14:41:45 +0200933 /* don't check consistency: unicode_kind_name() is called from
934 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200935 if (!PyUnicode_IS_COMPACT(unicode))
936 {
937 if (!PyUnicode_IS_READY(unicode))
938 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600939 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200940 {
941 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200942 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 return "legacy ascii";
944 else
945 return "legacy latin1";
946 case PyUnicode_2BYTE_KIND:
947 return "legacy UCS2";
948 case PyUnicode_4BYTE_KIND:
949 return "legacy UCS4";
950 default:
951 return "<legacy invalid kind>";
952 }
953 }
954 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600955 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200956 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200957 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200958 return "ascii";
959 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200960 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200961 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200962 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200963 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200965 default:
966 return "<invalid compact kind>";
967 }
968}
969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
975
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
979void *_PyUnicode_data(void *unicode){
980 printf("obj %p\n", unicode);
981 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
982 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
983 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
984 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
985 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
986 return PyUnicode_DATA(unicode);
987}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200988
989void
990_PyUnicode_Dump(PyObject *op)
991{
992 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200993 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
994 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
995 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200996
Victor Stinnera849a4b2011-10-03 12:12:11 +0200997 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200998 {
999 if (ascii->state.ascii)
1000 data = (ascii + 1);
1001 else
1002 data = (compact + 1);
1003 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001004 else
1005 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1007
Victor Stinnera849a4b2011-10-03 12:12:11 +02001008 if (ascii->wstr == data)
1009 printf("shared ");
1010 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001011
Victor Stinnera3b334d2011-10-03 13:53:37 +02001012 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001013 printf(" (%zu), ", compact->wstr_length);
1014 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1015 printf("shared ");
1016 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001017 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001018 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001019}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001020#endif
1021
1022PyObject *
1023PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1024{
1025 PyObject *obj;
1026 PyCompactUnicodeObject *unicode;
1027 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001028 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001029 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 Py_ssize_t char_size;
1031 Py_ssize_t struct_size;
1032
1033 /* Optimization for empty strings */
1034 if (size == 0 && unicode_empty != NULL) {
1035 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001036 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 }
1038
Victor Stinner9e9d6892011-10-04 01:02:02 +02001039 is_ascii = 0;
1040 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 struct_size = sizeof(PyCompactUnicodeObject);
1042 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001043 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 char_size = 1;
1045 is_ascii = 1;
1046 struct_size = sizeof(PyASCIIObject);
1047 }
1048 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001049 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 char_size = 1;
1051 }
1052 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001053 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 char_size = 2;
1055 if (sizeof(wchar_t) == 2)
1056 is_sharing = 1;
1057 }
1058 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001059 if (maxchar > MAX_UNICODE) {
1060 PyErr_SetString(PyExc_SystemError,
1061 "invalid maximum character passed to PyUnicode_New");
1062 return NULL;
1063 }
Victor Stinner8f825062012-04-27 13:55:39 +02001064 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 char_size = 4;
1066 if (sizeof(wchar_t) == 4)
1067 is_sharing = 1;
1068 }
1069
1070 /* Ensure we won't overflow the size. */
1071 if (size < 0) {
1072 PyErr_SetString(PyExc_SystemError,
1073 "Negative size passed to PyUnicode_New");
1074 return NULL;
1075 }
1076 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1077 return PyErr_NoMemory();
1078
1079 /* Duplicated allocation code from _PyObject_New() instead of a call to
1080 * PyObject_New() so we are able to allocate space for the object and
1081 * it's data buffer.
1082 */
1083 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1084 if (obj == NULL)
1085 return PyErr_NoMemory();
1086 obj = PyObject_INIT(obj, &PyUnicode_Type);
1087 if (obj == NULL)
1088 return NULL;
1089
1090 unicode = (PyCompactUnicodeObject *)obj;
1091 if (is_ascii)
1092 data = ((PyASCIIObject*)obj) + 1;
1093 else
1094 data = unicode + 1;
1095 _PyUnicode_LENGTH(unicode) = size;
1096 _PyUnicode_HASH(unicode) = -1;
1097 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001098 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099 _PyUnicode_STATE(unicode).compact = 1;
1100 _PyUnicode_STATE(unicode).ready = 1;
1101 _PyUnicode_STATE(unicode).ascii = is_ascii;
1102 if (is_ascii) {
1103 ((char*)data)[size] = 0;
1104 _PyUnicode_WSTR(unicode) = NULL;
1105 }
Victor Stinner8f825062012-04-27 13:55:39 +02001106 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 ((char*)data)[size] = 0;
1108 _PyUnicode_WSTR(unicode) = NULL;
1109 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001111 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 else {
1114 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001115 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001116 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001118 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 ((Py_UCS4*)data)[size] = 0;
1120 if (is_sharing) {
1121 _PyUnicode_WSTR_LENGTH(unicode) = size;
1122 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1123 }
1124 else {
1125 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1126 _PyUnicode_WSTR(unicode) = NULL;
1127 }
1128 }
Victor Stinner8f825062012-04-27 13:55:39 +02001129#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001130 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001131#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001132 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133 return obj;
1134}
1135
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t characters in [begin, end) to UCS4 and store
   them in the 4-byte data of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into a single code point; any other
   unit, including a lone surrogate, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Complete pair: two input units become one code point. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src;
        src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1175
Victor Stinnercd9950f2011-10-02 00:34:53 +02001176static int
Victor Stinner488fa492011-12-12 00:01:39 +01001177unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001178{
Victor Stinner488fa492011-12-12 00:01:39 +01001179 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001180 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001181 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001182 return -1;
1183 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184 return 0;
1185}
1186
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001187static int
1188_copy_characters(PyObject *to, Py_ssize_t to_start,
1189 PyObject *from, Py_ssize_t from_start,
1190 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001192 unsigned int from_kind, to_kind;
1193 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194
Victor Stinneree4544c2012-05-09 22:24:08 +02001195 assert(0 <= how_many);
1196 assert(0 <= from_start);
1197 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001198 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001199 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001200 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201
Victor Stinnerd3f08822012-05-29 12:57:52 +02001202 assert(PyUnicode_Check(to));
1203 assert(PyUnicode_IS_READY(to));
1204 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1205
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001206 if (how_many == 0)
1207 return 0;
1208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001212 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213
Victor Stinnerf1852262012-06-16 16:38:26 +02001214#ifdef Py_DEBUG
1215 if (!check_maxchar
1216 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1217 {
1218 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1219 Py_UCS4 ch;
1220 Py_ssize_t i;
1221 for (i=0; i < how_many; i++) {
1222 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1223 assert(ch <= to_maxchar);
1224 }
1225 }
1226#endif
1227
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001228 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001229 if (check_maxchar
1230 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1231 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 /* Writing Latin-1 characters into an ASCII string requires to
1233 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001234 Py_UCS4 max_char;
1235 max_char = ucs1lib_find_max_char(from_data,
1236 (Py_UCS1*)from_data + how_many);
1237 if (max_char >= 128)
1238 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001239 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001240 Py_MEMCPY((char*)to_data + to_kind * to_start,
1241 (char*)from_data + from_kind * from_start,
1242 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001244 else if (from_kind == PyUnicode_1BYTE_KIND
1245 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001246 {
1247 _PyUnicode_CONVERT_BYTES(
1248 Py_UCS1, Py_UCS2,
1249 PyUnicode_1BYTE_DATA(from) + from_start,
1250 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1251 PyUnicode_2BYTE_DATA(to) + to_start
1252 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001253 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001254 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001255 && to_kind == PyUnicode_4BYTE_KIND)
1256 {
1257 _PyUnicode_CONVERT_BYTES(
1258 Py_UCS1, Py_UCS4,
1259 PyUnicode_1BYTE_DATA(from) + from_start,
1260 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1261 PyUnicode_4BYTE_DATA(to) + to_start
1262 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 }
1264 else if (from_kind == PyUnicode_2BYTE_KIND
1265 && to_kind == PyUnicode_4BYTE_KIND)
1266 {
1267 _PyUnicode_CONVERT_BYTES(
1268 Py_UCS2, Py_UCS4,
1269 PyUnicode_2BYTE_DATA(from) + from_start,
1270 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1271 PyUnicode_4BYTE_DATA(to) + to_start
1272 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001273 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001274 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001275 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1276
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001277 if (!check_maxchar) {
1278 if (from_kind == PyUnicode_2BYTE_KIND
1279 && to_kind == PyUnicode_1BYTE_KIND)
1280 {
1281 _PyUnicode_CONVERT_BYTES(
1282 Py_UCS2, Py_UCS1,
1283 PyUnicode_2BYTE_DATA(from) + from_start,
1284 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1285 PyUnicode_1BYTE_DATA(to) + to_start
1286 );
1287 }
1288 else if (from_kind == PyUnicode_4BYTE_KIND
1289 && to_kind == PyUnicode_1BYTE_KIND)
1290 {
1291 _PyUnicode_CONVERT_BYTES(
1292 Py_UCS4, Py_UCS1,
1293 PyUnicode_4BYTE_DATA(from) + from_start,
1294 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1295 PyUnicode_1BYTE_DATA(to) + to_start
1296 );
1297 }
1298 else if (from_kind == PyUnicode_4BYTE_KIND
1299 && to_kind == PyUnicode_2BYTE_KIND)
1300 {
1301 _PyUnicode_CONVERT_BYTES(
1302 Py_UCS4, Py_UCS2,
1303 PyUnicode_4BYTE_DATA(from) + from_start,
1304 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1305 PyUnicode_2BYTE_DATA(to) + to_start
1306 );
1307 }
1308 else {
1309 assert(0);
1310 return -1;
1311 }
1312 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001313 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001314 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001315 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001316 Py_ssize_t i;
1317
Victor Stinnera0702ab2011-09-29 14:14:38 +02001318 for (i=0; i < how_many; i++) {
1319 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001320 if (ch > to_maxchar)
1321 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1323 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 }
1325 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001326 return 0;
1327}
1328
Victor Stinnerd3f08822012-05-29 12:57:52 +02001329void
1330_PyUnicode_FastCopyCharacters(
1331 PyObject *to, Py_ssize_t to_start,
1332 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001333{
1334 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1335}
1336
1337Py_ssize_t
1338PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1339 PyObject *from, Py_ssize_t from_start,
1340 Py_ssize_t how_many)
1341{
1342 int err;
1343
1344 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1345 PyErr_BadInternalCall();
1346 return -1;
1347 }
1348
Benjamin Petersonbac79492012-01-14 13:34:47 -05001349 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001350 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001351 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001352 return -1;
1353
Victor Stinnerd3f08822012-05-29 12:57:52 +02001354 if (from_start < 0) {
1355 PyErr_SetString(PyExc_IndexError, "string index out of range");
1356 return -1;
1357 }
1358 if (to_start < 0) {
1359 PyErr_SetString(PyExc_IndexError, "string index out of range");
1360 return -1;
1361 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001362 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1363 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1364 PyErr_Format(PyExc_SystemError,
1365 "Cannot write %zi characters at %zi "
1366 "in a string of %zi characters",
1367 how_many, to_start, PyUnicode_GET_LENGTH(to));
1368 return -1;
1369 }
1370
1371 if (how_many == 0)
1372 return 0;
1373
Victor Stinner488fa492011-12-12 00:01:39 +01001374 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001375 return -1;
1376
1377 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1378 if (err) {
1379 PyErr_Format(PyExc_SystemError,
1380 "Cannot copy %s characters "
1381 "into a string of %s characters",
1382 unicode_kind_name(from),
1383 unicode_kind_name(to));
1384 return -1;
1385 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001386 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387}
1388
Victor Stinner17222162011-09-28 22:15:37 +02001389/* Find the maximum code point and count the number of surrogate pairs so a
1390 correct string length can be computed before converting a string to UCS4.
1391 This function counts single surrogates as a character and not as a pair.
1392
1393 Return 0 on success, or -1 on error. */
1394static int
1395find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1396 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397{
1398 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001399 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400
Victor Stinnerc53be962011-10-02 21:33:54 +02001401 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 *num_surrogates = 0;
1403 *maxchar = 0;
1404
1405 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001407 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1408 && (iter+1) < end
1409 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1410 {
1411 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1412 ++(*num_surrogates);
1413 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 }
1415 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001417 {
1418 ch = *iter;
1419 iter++;
1420 }
1421 if (ch > *maxchar) {
1422 *maxchar = ch;
1423 if (*maxchar > MAX_UNICODE) {
1424 PyErr_Format(PyExc_ValueError,
1425 "character U+%x is not in range [U+0000; U+10ffff]",
1426 ch);
1427 return -1;
1428 }
1429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 }
1431 return 0;
1432}
1433
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001434int
1435_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436{
1437 wchar_t *end;
1438 Py_UCS4 maxchar = 0;
1439 Py_ssize_t num_surrogates;
1440#if SIZEOF_WCHAR_T == 2
1441 Py_ssize_t length_wo_surrogates;
1442#endif
1443
Georg Brandl7597add2011-10-05 16:36:47 +02001444 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001445 strings were created using _PyObject_New() and where no canonical
1446 representation (the str field) has been set yet aka strings
1447 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001448 assert(_PyUnicode_CHECK(unicode));
1449 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001451 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 /* Actually, it should neither be interned nor be anything else: */
1454 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001457 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001458 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1463 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 PyErr_NoMemory();
1465 return -1;
1466 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001467 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 _PyUnicode_WSTR(unicode), end,
1469 PyUnicode_1BYTE_DATA(unicode));
1470 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1471 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1472 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1473 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001474 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001475 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001476 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 }
1478 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001479 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001480 _PyUnicode_UTF8(unicode) = NULL;
1481 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 }
1483 PyObject_FREE(_PyUnicode_WSTR(unicode));
1484 _PyUnicode_WSTR(unicode) = NULL;
1485 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1486 }
1487 /* In this case we might have to convert down from 4-byte native
1488 wchar_t to 2-byte unicode. */
1489 else if (maxchar < 65536) {
1490 assert(num_surrogates == 0 &&
1491 "FindMaxCharAndNumSurrogatePairs() messed up");
1492
Victor Stinner506f5922011-09-28 22:34:18 +02001493#if SIZEOF_WCHAR_T == 2
1494 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001495 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001496 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1497 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1498 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001499 _PyUnicode_UTF8(unicode) = NULL;
1500 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001501#else
1502 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001504 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001505 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001506 PyErr_NoMemory();
1507 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001508 }
Victor Stinner506f5922011-09-28 22:34:18 +02001509 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1510 _PyUnicode_WSTR(unicode), end,
1511 PyUnicode_2BYTE_DATA(unicode));
1512 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1513 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1514 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001515 _PyUnicode_UTF8(unicode) = NULL;
1516 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001517 PyObject_FREE(_PyUnicode_WSTR(unicode));
1518 _PyUnicode_WSTR(unicode) = NULL;
1519 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1520#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 }
1522 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1523 else {
1524#if SIZEOF_WCHAR_T == 2
1525 /* in case the native representation is 2-bytes, we need to allocate a
1526 new normalized 4-byte version. */
1527 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001528 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1529 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001530 PyErr_NoMemory();
1531 return -1;
1532 }
1533 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1534 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001535 _PyUnicode_UTF8(unicode) = NULL;
1536 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001537 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1538 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001539 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 PyObject_FREE(_PyUnicode_WSTR(unicode));
1541 _PyUnicode_WSTR(unicode) = NULL;
1542 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1543#else
1544 assert(num_surrogates == 0);
1545
Victor Stinnerc3c74152011-10-02 20:39:55 +02001546 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001548 _PyUnicode_UTF8(unicode) = NULL;
1549 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1551#endif
1552 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1553 }
1554 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001555 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001556 return 0;
1557}
1558
Alexander Belopolsky40018472011-02-26 01:02:56 +00001559static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001560unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561{
Walter Dörwald16807132007-05-25 13:52:07 +00001562 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001563 case SSTATE_NOT_INTERNED:
1564 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001565
Benjamin Peterson29060642009-01-31 22:14:21 +00001566 case SSTATE_INTERNED_MORTAL:
1567 /* revive dead object temporarily for DelItem */
1568 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001569 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 Py_FatalError(
1571 "deletion of interned string failed");
1572 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 case SSTATE_INTERNED_IMMORTAL:
1575 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001576
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 default:
1578 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001579 }
1580
Victor Stinner03490912011-10-03 23:45:12 +02001581 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001582 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001583 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001584 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001585 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1586 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001588 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589}
1590
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   the object stored in unicode_latin1[] for its single character,
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string whose kind is not the wchar_t kind can
       be one of the cached latin1 characters. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return (unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1607
Alexander Belopolsky40018472011-02-26 01:02:56 +00001608static int
Victor Stinner488fa492011-12-12 00:01:39 +01001609unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001610{
Victor Stinner488fa492011-12-12 00:01:39 +01001611 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 if (Py_REFCNT(unicode) != 1)
1613 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001614 if (_PyUnicode_HASH(unicode) != -1)
1615 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001616 if (PyUnicode_CHECK_INTERNED(unicode))
1617 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001618 if (!PyUnicode_CheckExact(unicode))
1619 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001620#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001621 /* singleton refcount is greater than 1 */
1622 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001623#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 return 1;
1625}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001626
Victor Stinnerfe226c02011-10-03 03:52:20 +02001627static int
1628unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1629{
1630 PyObject *unicode;
1631 Py_ssize_t old_length;
1632
1633 assert(p_unicode != NULL);
1634 unicode = *p_unicode;
1635
1636 assert(unicode != NULL);
1637 assert(PyUnicode_Check(unicode));
1638 assert(0 <= length);
1639
Victor Stinner910337b2011-10-03 03:20:16 +02001640 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 old_length = PyUnicode_WSTR_LENGTH(unicode);
1642 else
1643 old_length = PyUnicode_GET_LENGTH(unicode);
1644 if (old_length == length)
1645 return 0;
1646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001648 _Py_INCREF_UNICODE_EMPTY();
1649 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001650 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001651 Py_DECREF(*p_unicode);
1652 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001653 return 0;
1654 }
1655
Victor Stinner488fa492011-12-12 00:01:39 +01001656 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001657 PyObject *copy = resize_copy(unicode, length);
1658 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001659 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001660 Py_DECREF(*p_unicode);
1661 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001663 }
1664
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001666 PyObject *new_unicode = resize_compact(unicode, length);
1667 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001669 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001670 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001671 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001672 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001673}
1674
Alexander Belopolsky40018472011-02-26 01:02:56 +00001675int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001677{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 PyObject *unicode;
1679 if (p_unicode == NULL) {
1680 PyErr_BadInternalCall();
1681 return -1;
1682 }
1683 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001684 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001685 {
1686 PyErr_BadInternalCall();
1687 return -1;
1688 }
1689 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001690}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001691
Victor Stinnerc5166102012-02-22 13:55:02 +01001692/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001693
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001694 WARNING: The function doesn't copy the terminating null character and
1695 doesn't check the maximum character (may write a latin1 character in an
1696 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001697static void
1698unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1699 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001700{
1701 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1702 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001703 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001704
1705 switch (kind) {
1706 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001707 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001708#ifdef Py_DEBUG
1709 if (PyUnicode_IS_ASCII(unicode)) {
1710 Py_UCS4 maxchar = ucs1lib_find_max_char(
1711 (const Py_UCS1*)str,
1712 (const Py_UCS1*)str + len);
1713 assert(maxchar < 128);
1714 }
1715#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001716 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001717 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001718 }
1719 case PyUnicode_2BYTE_KIND: {
1720 Py_UCS2 *start = (Py_UCS2 *)data + index;
1721 Py_UCS2 *ucs2 = start;
1722 assert(index <= PyUnicode_GET_LENGTH(unicode));
1723
Victor Stinner184252a2012-06-16 02:57:41 +02001724 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001725 *ucs2 = (Py_UCS2)*str;
1726
1727 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001728 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001729 }
1730 default: {
1731 Py_UCS4 *start = (Py_UCS4 *)data + index;
1732 Py_UCS4 *ucs4 = start;
1733 assert(kind == PyUnicode_4BYTE_KIND);
1734 assert(index <= PyUnicode_GET_LENGTH(unicode));
1735
Victor Stinner184252a2012-06-16 02:57:41 +02001736 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 *ucs4 = (Py_UCS4)*str;
1738
1739 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001740 }
1741 }
1742}
1743
1744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745static PyObject*
1746get_latin1_char(unsigned char ch)
1747{
Victor Stinnera464fc12011-10-02 20:39:30 +02001748 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001750 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 if (!unicode)
1752 return NULL;
1753 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001754 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 unicode_latin1[ch] = unicode;
1756 }
1757 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001758 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759}
1760
Alexander Belopolsky40018472011-02-26 01:02:56 +00001761PyObject *
1762PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001764 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 Py_UCS4 maxchar = 0;
1766 Py_ssize_t num_surrogates;
1767
1768 if (u == NULL)
1769 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001771 /* If the Unicode data is known at construction time, we can apply
1772 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001775 if (size == 0)
1776 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 /* Single character Unicode objects in the Latin-1 range are
1779 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001780 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 return get_latin1_char((unsigned char)*u);
1782
1783 /* If not empty and not single character, copy the Unicode data
1784 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001785 if (find_maxchar_surrogates(u, u + size,
1786 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 return NULL;
1788
Victor Stinner8faf8212011-12-08 22:14:11 +01001789 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 if (!unicode)
1791 return NULL;
1792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 switch (PyUnicode_KIND(unicode)) {
1794 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001795 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1797 break;
1798 case PyUnicode_2BYTE_KIND:
1799#if Py_UNICODE_SIZE == 2
1800 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1801#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001802 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1804#endif
1805 break;
1806 case PyUnicode_4BYTE_KIND:
1807#if SIZEOF_WCHAR_T == 2
1808 /* This is the only case which has to process surrogates, thus
1809 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001810 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811#else
1812 assert(num_surrogates == 0);
1813 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1814#endif
1815 break;
1816 default:
1817 assert(0 && "Impossible state");
1818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821}
1822
Alexander Belopolsky40018472011-02-26 01:02:56 +00001823PyObject *
1824PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001825{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 if (size < 0) {
1827 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001828 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001829 return NULL;
1830 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001831 if (u != NULL)
1832 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1833 else
1834 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001835}
1836
Alexander Belopolsky40018472011-02-26 01:02:56 +00001837PyObject *
1838PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001839{
1840 size_t size = strlen(u);
1841 if (size > PY_SSIZE_T_MAX) {
1842 PyErr_SetString(PyExc_OverflowError, "input too long");
1843 return NULL;
1844 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001845 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001846}
1847
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001848PyObject *
1849_PyUnicode_FromId(_Py_Identifier *id)
1850{
1851 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001852 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1853 strlen(id->string),
1854 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855 if (!id->object)
1856 return NULL;
1857 PyUnicode_InternInPlace(&id->object);
1858 assert(!id->next);
1859 id->next = static_strings;
1860 static_strings = id;
1861 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001862 return id->object;
1863}
1864
1865void
1866_PyUnicode_ClearStaticStrings()
1867{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001868 _Py_Identifier *tmp, *s = static_strings;
1869 while (s) {
1870 Py_DECREF(s->object);
1871 s->object = NULL;
1872 tmp = s->next;
1873 s->next = NULL;
1874 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001875 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001876 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001877}
1878
Benjamin Peterson0df54292012-03-26 14:50:32 -04001879/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001880
Victor Stinnerd3f08822012-05-29 12:57:52 +02001881PyObject*
1882_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001883{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001884 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001885 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001886 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001887#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001888 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001889#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001890 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001891 }
Victor Stinner785938e2011-12-11 20:09:03 +01001892 unicode = PyUnicode_New(size, 127);
1893 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001894 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001895 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1896 assert(_PyUnicode_CheckConsistency(unicode, 1));
1897 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001898}
1899
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001900static Py_UCS4
1901kind_maxchar_limit(unsigned int kind)
1902{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001903 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001904 case PyUnicode_1BYTE_KIND:
1905 return 0x80;
1906 case PyUnicode_2BYTE_KIND:
1907 return 0x100;
1908 case PyUnicode_4BYTE_KIND:
1909 return 0x10000;
1910 default:
1911 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001912 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001913 }
1914}
1915
Victor Stinnere6abb482012-05-02 01:15:40 +02001916Py_LOCAL_INLINE(Py_UCS4)
1917align_maxchar(Py_UCS4 maxchar)
1918{
1919 if (maxchar <= 127)
1920 return 127;
1921 else if (maxchar <= 255)
1922 return 255;
1923 else if (maxchar <= 65535)
1924 return 65535;
1925 else
1926 return MAX_UNICODE;
1927}
1928
Victor Stinner702c7342011-10-05 13:50:52 +02001929static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001930_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001933 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001934
Serhiy Storchaka678db842013-01-26 12:16:36 +02001935 if (size == 0)
1936 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001937 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001938 if (size == 1)
1939 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001940
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001941 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001942 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 if (!res)
1944 return NULL;
1945 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001946 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001948}
1949
Victor Stinnere57b1c02011-09-28 22:20:48 +02001950static PyObject*
1951_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952{
1953 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001954 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001955
Serhiy Storchaka678db842013-01-26 12:16:36 +02001956 if (size == 0)
1957 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001958 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001959 if (size == 1) {
1960 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001961 int kind;
1962 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001963 if (ch < 256)
1964 return get_latin1_char((unsigned char)ch);
1965
1966 res = PyUnicode_New(1, ch);
1967 if (res == NULL)
1968 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001969 kind = PyUnicode_KIND(res);
1970 data = PyUnicode_DATA(res);
1971 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001972 assert(_PyUnicode_CheckConsistency(res, 1));
1973 return res;
1974 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001977 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (!res)
1979 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001982 else {
1983 _PyUnicode_CONVERT_BYTES(
1984 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1985 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001986 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 return res;
1988}
1989
Victor Stinnere57b1c02011-09-28 22:20:48 +02001990static PyObject*
1991_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992{
1993 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001994 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001995
Serhiy Storchaka678db842013-01-26 12:16:36 +02001996 if (size == 0)
1997 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001998 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001999 if (size == 1) {
2000 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002001 int kind;
2002 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002003 if (ch < 256)
2004 return get_latin1_char((unsigned char)ch);
2005
2006 res = PyUnicode_New(1, ch);
2007 if (res == NULL)
2008 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002009 kind = PyUnicode_KIND(res);
2010 data = PyUnicode_DATA(res);
2011 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002012 assert(_PyUnicode_CheckConsistency(res, 1));
2013 return res;
2014 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002015
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002016 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002017 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 if (!res)
2019 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002020 if (max_char < 256)
2021 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2022 PyUnicode_1BYTE_DATA(res));
2023 else if (max_char < 0x10000)
2024 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2025 PyUnicode_2BYTE_DATA(res));
2026 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002028 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 return res;
2030}
2031
2032PyObject*
2033PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2034{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002035 if (size < 0) {
2036 PyErr_SetString(PyExc_ValueError, "size must be positive");
2037 return NULL;
2038 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002039 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002041 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002043 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002045 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002046 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002047 PyErr_SetString(PyExc_SystemError, "invalid kind");
2048 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050}
2051
Victor Stinnerece58de2012-04-23 23:36:38 +02002052Py_UCS4
2053_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2054{
2055 enum PyUnicode_Kind kind;
2056 void *startptr, *endptr;
2057
2058 assert(PyUnicode_IS_READY(unicode));
2059 assert(0 <= start);
2060 assert(end <= PyUnicode_GET_LENGTH(unicode));
2061 assert(start <= end);
2062
2063 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2064 return PyUnicode_MAX_CHAR_VALUE(unicode);
2065
2066 if (start == end)
2067 return 127;
2068
Victor Stinner94d558b2012-04-27 22:26:58 +02002069 if (PyUnicode_IS_ASCII(unicode))
2070 return 127;
2071
Victor Stinnerece58de2012-04-23 23:36:38 +02002072 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002073 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002074 endptr = (char *)startptr + end * kind;
2075 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002076 switch(kind) {
2077 case PyUnicode_1BYTE_KIND:
2078 return ucs1lib_find_max_char(startptr, endptr);
2079 case PyUnicode_2BYTE_KIND:
2080 return ucs2lib_find_max_char(startptr, endptr);
2081 case PyUnicode_4BYTE_KIND:
2082 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002083 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002084 assert(0);
2085 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002086 }
2087}
2088
Victor Stinner25a4b292011-10-06 12:31:55 +02002089/* Ensure that a string uses the most efficient storage, if it is not the
2090 case: create a new string with of the right kind. Write NULL into *p_unicode
2091 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002092static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002093unicode_adjust_maxchar(PyObject **p_unicode)
2094{
2095 PyObject *unicode, *copy;
2096 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002097 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002098 unsigned int kind;
2099
2100 assert(p_unicode != NULL);
2101 unicode = *p_unicode;
2102 assert(PyUnicode_IS_READY(unicode));
2103 if (PyUnicode_IS_ASCII(unicode))
2104 return;
2105
2106 len = PyUnicode_GET_LENGTH(unicode);
2107 kind = PyUnicode_KIND(unicode);
2108 if (kind == PyUnicode_1BYTE_KIND) {
2109 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002110 max_char = ucs1lib_find_max_char(u, u + len);
2111 if (max_char >= 128)
2112 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002113 }
2114 else if (kind == PyUnicode_2BYTE_KIND) {
2115 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002116 max_char = ucs2lib_find_max_char(u, u + len);
2117 if (max_char >= 256)
2118 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 }
2120 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002121 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002123 max_char = ucs4lib_find_max_char(u, u + len);
2124 if (max_char >= 0x10000)
2125 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002126 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002127 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002128 if (copy != NULL)
2129 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 Py_DECREF(unicode);
2131 *p_unicode = copy;
2132}
2133
Victor Stinner034f6cf2011-09-30 02:26:44 +02002134PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002135_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002136{
Victor Stinner87af4f22011-11-21 23:03:47 +01002137 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002138 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002139
Victor Stinner034f6cf2011-09-30 02:26:44 +02002140 if (!PyUnicode_Check(unicode)) {
2141 PyErr_BadInternalCall();
2142 return NULL;
2143 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002144 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002145 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146
Victor Stinner87af4f22011-11-21 23:03:47 +01002147 length = PyUnicode_GET_LENGTH(unicode);
2148 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149 if (!copy)
2150 return NULL;
2151 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2152
Victor Stinner87af4f22011-11-21 23:03:47 +01002153 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2154 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002155 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002157}
2158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159
Victor Stinnerbc603d12011-10-02 01:00:40 +02002160/* Widen Unicode objects to larger buffers. Don't write terminating null
2161 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162
2163void*
2164_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2165{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002166 Py_ssize_t len;
2167 void *result;
2168 unsigned int skind;
2169
Benjamin Petersonbac79492012-01-14 13:34:47 -05002170 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002171 return NULL;
2172
2173 len = PyUnicode_GET_LENGTH(s);
2174 skind = PyUnicode_KIND(s);
2175 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002176 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 return NULL;
2178 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002179 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002180 case PyUnicode_2BYTE_KIND:
2181 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2182 if (!result)
2183 return PyErr_NoMemory();
2184 assert(skind == PyUnicode_1BYTE_KIND);
2185 _PyUnicode_CONVERT_BYTES(
2186 Py_UCS1, Py_UCS2,
2187 PyUnicode_1BYTE_DATA(s),
2188 PyUnicode_1BYTE_DATA(s) + len,
2189 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002191 case PyUnicode_4BYTE_KIND:
2192 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2193 if (!result)
2194 return PyErr_NoMemory();
2195 if (skind == PyUnicode_2BYTE_KIND) {
2196 _PyUnicode_CONVERT_BYTES(
2197 Py_UCS2, Py_UCS4,
2198 PyUnicode_2BYTE_DATA(s),
2199 PyUnicode_2BYTE_DATA(s) + len,
2200 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002202 else {
2203 assert(skind == PyUnicode_1BYTE_KIND);
2204 _PyUnicode_CONVERT_BYTES(
2205 Py_UCS1, Py_UCS4,
2206 PyUnicode_1BYTE_DATA(s),
2207 PyUnicode_1BYTE_DATA(s) + len,
2208 result);
2209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002211 default:
2212 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 }
Victor Stinner01698042011-10-04 00:04:26 +02002214 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 return NULL;
2216}
2217
2218static Py_UCS4*
2219as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2220 int copy_null)
2221{
2222 int kind;
2223 void *data;
2224 Py_ssize_t len, targetlen;
2225 if (PyUnicode_READY(string) == -1)
2226 return NULL;
2227 kind = PyUnicode_KIND(string);
2228 data = PyUnicode_DATA(string);
2229 len = PyUnicode_GET_LENGTH(string);
2230 targetlen = len;
2231 if (copy_null)
2232 targetlen++;
2233 if (!target) {
2234 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2235 PyErr_NoMemory();
2236 return NULL;
2237 }
2238 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2239 if (!target) {
2240 PyErr_NoMemory();
2241 return NULL;
2242 }
2243 }
2244 else {
2245 if (targetsize < targetlen) {
2246 PyErr_Format(PyExc_SystemError,
2247 "string is longer than the buffer");
2248 if (copy_null && 0 < targetsize)
2249 target[0] = 0;
2250 return NULL;
2251 }
2252 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002253 if (kind == PyUnicode_1BYTE_KIND) {
2254 Py_UCS1 *start = (Py_UCS1 *) data;
2255 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002257 else if (kind == PyUnicode_2BYTE_KIND) {
2258 Py_UCS2 *start = (Py_UCS2 *) data;
2259 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2260 }
2261 else {
2262 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 if (copy_null)
2266 target[len] = 0;
2267 return target;
2268}
2269
2270Py_UCS4*
2271PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2272 int copy_null)
2273{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002274 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 PyErr_BadInternalCall();
2276 return NULL;
2277 }
2278 return as_ucs4(string, target, targetsize, copy_null);
2279}
2280
2281Py_UCS4*
2282PyUnicode_AsUCS4Copy(PyObject *string)
2283{
2284 return as_ucs4(string, NULL, 0, 1);
2285}
2286
2287#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002288
Alexander Belopolsky40018472011-02-26 01:02:56 +00002289PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002290PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002294 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002295 PyErr_BadInternalCall();
2296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 }
2298
Martin v. Löwis790465f2008-04-05 20:41:37 +00002299 if (size == -1) {
2300 size = wcslen(w);
2301 }
2302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002303 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304}
2305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002307
Walter Dörwald346737f2007-05-31 10:44:43 +00002308static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002309makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002310 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002311{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002312 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002313 if (longflag)
2314 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002315 else if (longlongflag) {
2316 /* longlongflag should only ever be nonzero on machines with
2317 HAVE_LONG_LONG defined */
2318#ifdef HAVE_LONG_LONG
2319 char *f = PY_FORMAT_LONG_LONG;
2320 while (*f)
2321 *fmt++ = *f++;
2322#else
2323 /* we shouldn't ever get here */
2324 assert(0);
2325 *fmt++ = 'l';
2326#endif
2327 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002328 else if (size_tflag) {
2329 char *f = PY_FORMAT_SIZE_T;
2330 while (*f)
2331 *fmt++ = *f++;
2332 }
2333 *fmt++ = c;
2334 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002335}
2336
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).
   Used to size the on-stack sprintf() buffers in
   unicode_fromformat_arg(). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002341
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002342static int
2343unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2344 Py_ssize_t width, Py_ssize_t precision)
2345{
2346 Py_ssize_t length, fill, arglen;
2347 Py_UCS4 maxchar;
2348
2349 if (PyUnicode_READY(str) == -1)
2350 return -1;
2351
2352 length = PyUnicode_GET_LENGTH(str);
2353 if ((precision == -1 || precision >= length)
2354 && width <= length)
2355 return _PyUnicodeWriter_WriteStr(writer, str);
2356
2357 if (precision != -1)
2358 length = Py_MIN(precision, length);
2359
2360 arglen = Py_MAX(length, width);
2361 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2362 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2363 else
2364 maxchar = writer->maxchar;
2365
2366 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2367 return -1;
2368
2369 if (width > length) {
2370 fill = width - length;
2371 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2372 return -1;
2373 writer->pos += fill;
2374 }
2375
2376 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2377 str, 0, length);
2378 writer->pos += length;
2379 return 0;
2380}
2381
2382static int
2383unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2384 Py_ssize_t width, Py_ssize_t precision)
2385{
2386 /* UTF-8 */
2387 Py_ssize_t length;
2388 PyObject *unicode;
2389 int res;
2390
2391 length = strlen(str);
2392 if (precision != -1)
2393 length = Py_MIN(length, precision);
2394 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2395 if (unicode == NULL)
2396 return -1;
2397
2398 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2399 Py_DECREF(unicode);
2400 return res;
2401}
2402
/* Format one '%' directive of a PyUnicode_FromFormatV() format string.

   f points at the '%' that starts the directive.  The arguments the
   directive needs are fetched from *vargs and the formatted text is
   appended to writer.

   Returns a pointer to the first character after the directive.  For an
   unknown conversion character the rest of the format string, starting
   at the '%', is copied verbatim and a pointer to its terminating NUL is
   returned.  Returns NULL on error (either an exception is set here, or
   the failing helper is expected to have set one). */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember the start of the directive: the "unknown format" fallback
       below copies the text from here */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* reject the next digit if width*10 + digit would exceed
               PY_SSIZE_T_MAX */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow guard as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* A length modifier is only consumed when it is directly followed by
       d, u or i; otherwise f is left pointing at the modifier itself. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* the conversion character is the last character of the format
       string: switch overallocation off for this final write */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single code point, passed as an int */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char fmt[10]; /* should be enough for "%0lld\0" */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* The number is first rendered by sprintf() into buffer using a
           format without width/precision (see makefmt()); padding is
           applied afterwards while writing into the writer. */
        if (*f == 'u') {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* no length modifier is applied to %x: the argument is
               always fetched as an int */
            makefmt(fmt, 0, 0, 0, 'x');
            len = sprintf(buffer, fmt, va_arg(*vargs, int));
        }
        else {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* for numbers the precision is a minimum length: it never
           truncates the sprintf() output */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* left padding up to the field width: '0' if the directive
           started with a 0 flag, ' ' otherwise */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* then pad with zeros up to the precision */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        unicode_write_cstr(writer->buffer, writer->pos, buffer, len);
        writer->pos += len;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined: ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* no prefix at all: shift the digits (and the NUL) right by
               two and insert "0x" in front */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127);
        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
            return NULL;
        unicode_write_cstr(writer->buffer, writer->pos, number, len);
        writer->pos += len;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a str object, written as is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* consumes two arguments: a str object (may be NULL) and a C
           string that is used when the object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* result of PyObject_Str(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* result of PyObject_Repr(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* result of PyObject_ASCII(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* literal percent sign; no argument is consumed */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* step over the conversion character */
    f++;
    return f;
}
2716
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717PyObject *
2718PyUnicode_FromFormatV(const char *format, va_list vargs)
2719{
Victor Stinnere215d962012-10-06 23:03:36 +02002720 va_list vargs2;
2721 const char *f;
2722 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002723
Victor Stinner8f674cc2013-04-17 23:02:17 +02002724 _PyUnicodeWriter_Init(&writer);
2725 writer.min_length = strlen(format) + 100;
2726 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002727
2728 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2729 Copy it to be able to pass a reference to a subfunction. */
2730 Py_VA_COPY(vargs2, vargs);
2731
2732 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002733 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002734 f = unicode_fromformat_arg(&writer, f, &vargs2);
2735 if (f == NULL)
2736 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002739 const char *p;
2740 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002741
Victor Stinnere215d962012-10-06 23:03:36 +02002742 p = f;
2743 do
2744 {
2745 if ((unsigned char)*p > 127) {
2746 PyErr_Format(PyExc_ValueError,
2747 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2748 "string, got a non-ASCII byte: 0x%02x",
2749 (unsigned char)*p);
2750 return NULL;
2751 }
2752 p++;
2753 }
2754 while (*p != '\0' && *p != '%');
2755 len = p - f;
2756
2757 if (*p == '\0')
2758 writer.overallocate = 0;
2759 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2760 goto fail;
2761 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2762 writer.pos += len;
2763
2764 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002765 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 }
Victor Stinnere215d962012-10-06 23:03:36 +02002767 return _PyUnicodeWriter_Finish(&writer);
2768
2769 fail:
2770 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002771 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002772}
2773
Walter Dörwaldd2034312007-05-18 16:29:38 +00002774PyObject *
2775PyUnicode_FromFormat(const char *format, ...)
2776{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 PyObject* ret;
2778 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002779
2780#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002781 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002784#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002785 ret = PyUnicode_FromFormatV(format, vargs);
2786 va_end(vargs);
2787 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002788}
2789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002790#ifdef HAVE_WCHAR_H
2791
Victor Stinner5593d8a2010-10-02 11:11:27 +00002792/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2793 convert a Unicode object to a wide character string.
2794
Victor Stinnerd88d9832011-09-06 02:00:05 +02002795 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002796 character) required to convert the unicode object. Ignore size argument.
2797
Victor Stinnerd88d9832011-09-06 02:00:05 +02002798 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002799 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002800 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002801static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002802unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002803 wchar_t *w,
2804 Py_ssize_t size)
2805{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002806 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 const wchar_t *wstr;
2808
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002809 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 if (wstr == NULL)
2811 return -1;
2812
Victor Stinner5593d8a2010-10-02 11:11:27 +00002813 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814 if (size > res)
2815 size = res + 1;
2816 else
2817 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002819 return res;
2820 }
2821 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002822 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002823}
2824
2825Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002826PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002827 wchar_t *w,
2828 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829{
2830 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002831 PyErr_BadInternalCall();
2832 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002834 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835}
2836
Victor Stinner137c34c2010-09-29 10:25:54 +00002837wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002838PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002839 Py_ssize_t *size)
2840{
2841 wchar_t* buffer;
2842 Py_ssize_t buflen;
2843
2844 if (unicode == NULL) {
2845 PyErr_BadInternalCall();
2846 return NULL;
2847 }
2848
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002849 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002850 if (buflen == -1)
2851 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002852 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002853 PyErr_NoMemory();
2854 return NULL;
2855 }
2856
Victor Stinner137c34c2010-09-29 10:25:54 +00002857 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2858 if (buffer == NULL) {
2859 PyErr_NoMemory();
2860 return NULL;
2861 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002862 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002863 if (buflen == -1) {
2864 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002865 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002866 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002867 if (size != NULL)
2868 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002869 return buffer;
2870}
2871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002872#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873
Alexander Belopolsky40018472011-02-26 01:02:56 +00002874PyObject *
2875PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002877 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002878 void *data;
2879 int kind;
2880
Victor Stinner8faf8212011-12-08 22:14:11 +01002881 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002882 PyErr_SetString(PyExc_ValueError,
2883 "chr() arg not in range(0x110000)");
2884 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002885 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002886
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002887 if ((Py_UCS4)ordinal < 256)
2888 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 v = PyUnicode_New(1, ordinal);
2891 if (v == NULL)
2892 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002893 kind = PyUnicode_KIND(v);
2894 data = PyUnicode_DATA(v);
2895 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002896 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002897 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002898}
2899
Alexander Belopolsky40018472011-02-26 01:02:56 +00002900PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002901PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002903 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002904 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002905 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002906 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002907 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002908 Py_INCREF(obj);
2909 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002910 }
2911 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 /* For a Unicode subtype that's not a Unicode object,
2913 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002914 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002915 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002916 PyErr_Format(PyExc_TypeError,
2917 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002918 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002919 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002920}
2921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002923PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002924 const char *encoding,
2925 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002926{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002927 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002928 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002929
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 PyErr_BadInternalCall();
2932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002934
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002935 /* Decoding bytes objects is the most common case and should be fast */
2936 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002937 if (PyBytes_GET_SIZE(obj) == 0)
2938 _Py_RETURN_UNICODE_EMPTY();
2939 v = PyUnicode_Decode(
2940 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2941 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002942 return v;
2943 }
2944
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002945 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002946 PyErr_SetString(PyExc_TypeError,
2947 "decoding str is not supported");
2948 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002950
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002951 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2952 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2953 PyErr_Format(PyExc_TypeError,
2954 "coercing to str: need bytes, bytearray "
2955 "or buffer-like object, %.80s found",
2956 Py_TYPE(obj)->tp_name);
2957 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002958 }
Tim Petersced69f82003-09-16 20:30:58 +00002959
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002960 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002961 PyBuffer_Release(&buffer);
2962 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002964
Serhiy Storchaka05997252013-01-26 12:14:02 +02002965 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002966 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002967 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968}
2969
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the NUL-terminated result; lower_len is the capacity of
   lower in bytes, terminator included.

   Return 1 on success.  Return 0 on error, i.e. when the normalized name
   plus its terminator does not fit into lower_len bytes (in that case the
   contents of lower are unspecified and not NUL-terminated). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for at least the terminator nothing can be stored;
       bailing out here also keeps lower_len - 1 below from wrapping. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* The default name needs 6 bytes ("utf-8" + NUL): never copy it
           into a smaller buffer. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3006
Alexander Belopolsky40018472011-02-26 01:02:56 +00003007PyObject *
3008PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003009 Py_ssize_t size,
3010 const char *encoding,
3011 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003012{
3013 PyObject *buffer = NULL, *unicode;
3014 Py_buffer info;
3015 char lower[11]; /* Enough for any encoding shortcut */
3016
Fred Drakee4315f52000-05-09 19:53:39 +00003017 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003018 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003019 if ((strcmp(lower, "utf-8") == 0) ||
3020 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003021 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003022 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003023 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003024 (strcmp(lower, "iso-8859-1") == 0))
3025 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003026#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003027 else if (strcmp(lower, "mbcs") == 0)
3028 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003029#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003030 else if (strcmp(lower, "ascii") == 0)
3031 return PyUnicode_DecodeASCII(s, size, errors);
3032 else if (strcmp(lower, "utf-16") == 0)
3033 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3034 else if (strcmp(lower, "utf-32") == 0)
3035 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037
3038 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003039 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003040 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003041 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003042 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 if (buffer == NULL)
3044 goto onError;
3045 unicode = PyCodec_Decode(buffer, encoding, errors);
3046 if (unicode == NULL)
3047 goto onError;
3048 if (!PyUnicode_Check(unicode)) {
3049 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003050 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003051 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 Py_DECREF(unicode);
3053 goto onError;
3054 }
3055 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003056 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003057
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 Py_XDECREF(buffer);
3060 return NULL;
3061}
3062
Alexander Belopolsky40018472011-02-26 01:02:56 +00003063PyObject *
3064PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003065 const char *encoding,
3066 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003067{
3068 PyObject *v;
3069
3070 if (!PyUnicode_Check(unicode)) {
3071 PyErr_BadArgument();
3072 goto onError;
3073 }
3074
3075 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
3078 /* Decode via the codec registry */
3079 v = PyCodec_Decode(unicode, encoding, errors);
3080 if (v == NULL)
3081 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003082 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003083
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003085 return NULL;
3086}
3087
Alexander Belopolsky40018472011-02-26 01:02:56 +00003088PyObject *
3089PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003090 const char *encoding,
3091 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003092{
3093 PyObject *v;
3094
3095 if (!PyUnicode_Check(unicode)) {
3096 PyErr_BadArgument();
3097 goto onError;
3098 }
3099
3100 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003101 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003102
3103 /* Decode via the codec registry */
3104 v = PyCodec_Decode(unicode, encoding, errors);
3105 if (v == NULL)
3106 goto onError;
3107 if (!PyUnicode_Check(v)) {
3108 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003109 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003110 Py_TYPE(v)->tp_name);
3111 Py_DECREF(v);
3112 goto onError;
3113 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003114 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003115
Benjamin Peterson29060642009-01-31 22:14:21 +00003116 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003117 return NULL;
3118}
3119
Alexander Belopolsky40018472011-02-26 01:02:56 +00003120PyObject *
3121PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003122 Py_ssize_t size,
3123 const char *encoding,
3124 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125{
3126 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003127
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 unicode = PyUnicode_FromUnicode(s, size);
3129 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3132 Py_DECREF(unicode);
3133 return v;
3134}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 const char *encoding,
3139 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003140{
3141 PyObject *v;
3142
3143 if (!PyUnicode_Check(unicode)) {
3144 PyErr_BadArgument();
3145 goto onError;
3146 }
3147
3148 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003150
3151 /* Encode via the codec registry */
3152 v = PyCodec_Encode(unicode, encoding, errors);
3153 if (v == NULL)
3154 goto onError;
3155 return v;
3156
Benjamin Peterson29060642009-01-31 22:14:21 +00003157 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003158 return NULL;
3159}
3160
/* Locate the first wide character of wstr that wcstombs() cannot
   convert in the current locale.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 16 bits wide).  Returns the index, in wchar_t
   units, of the offending character, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *unit_start;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif
    for (cursor = wstr; *cursor != L'\0'; ) {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor++;
            unit[1] = 0;
        }
#else
        unit[0] = *cursor++;
#endif
        /* Only the success/failure status matters; the bytes written to
           scratch are discarded. */
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3207
Victor Stinner1b579672011-12-17 05:47:23 +01003208static int
3209locale_error_handler(const char *errors, int *surrogateescape)
3210{
3211 if (errors == NULL) {
3212 *surrogateescape = 0;
3213 return 0;
3214 }
3215
3216 if (strcmp(errors, "strict") == 0) {
3217 *surrogateescape = 0;
3218 return 0;
3219 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003220 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003221 *surrogateescape = 1;
3222 return 0;
3223 }
3224 PyErr_Format(PyExc_ValueError,
3225 "only 'strict' and 'surrogateescape' error handlers "
3226 "are supported, not '%s'",
3227 errors);
3228 return -1;
3229}
3230
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003231PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003232PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003233{
3234 Py_ssize_t wlen, wlen2;
3235 wchar_t *wstr;
3236 PyObject *bytes = NULL;
3237 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003238 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003239 PyObject *exc;
3240 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003241 int surrogateescape;
3242
3243 if (locale_error_handler(errors, &surrogateescape) < 0)
3244 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003245
3246 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3247 if (wstr == NULL)
3248 return NULL;
3249
3250 wlen2 = wcslen(wstr);
3251 if (wlen2 != wlen) {
3252 PyMem_Free(wstr);
3253 PyErr_SetString(PyExc_TypeError, "embedded null character");
3254 return NULL;
3255 }
3256
3257 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003258 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003259 char *str;
3260
3261 str = _Py_wchar2char(wstr, &error_pos);
3262 if (str == NULL) {
3263 if (error_pos == (size_t)-1) {
3264 PyErr_NoMemory();
3265 PyMem_Free(wstr);
3266 return NULL;
3267 }
3268 else {
3269 goto encode_error;
3270 }
3271 }
3272 PyMem_Free(wstr);
3273
3274 bytes = PyBytes_FromString(str);
3275 PyMem_Free(str);
3276 }
3277 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003278 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003279 size_t len, len2;
3280
3281 len = wcstombs(NULL, wstr, 0);
3282 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003283 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003284 goto encode_error;
3285 }
3286
3287 bytes = PyBytes_FromStringAndSize(NULL, len);
3288 if (bytes == NULL) {
3289 PyMem_Free(wstr);
3290 return NULL;
3291 }
3292
3293 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3294 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003295 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003296 goto encode_error;
3297 }
3298 PyMem_Free(wstr);
3299 }
3300 return bytes;
3301
3302encode_error:
3303 errmsg = strerror(errno);
3304 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003305
3306 if (error_pos == (size_t)-1)
3307 error_pos = wcstombs_errorpos(wstr);
3308
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003309 PyMem_Free(wstr);
3310 Py_XDECREF(bytes);
3311
Victor Stinner2f197072011-12-17 07:08:30 +01003312 if (errmsg != NULL) {
3313 size_t errlen;
3314 wstr = _Py_char2wchar(errmsg, &errlen);
3315 if (wstr != NULL) {
3316 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003317 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003318 } else
3319 errmsg = NULL;
3320 }
3321 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003322 reason = PyUnicode_FromString(
3323 "wcstombs() encountered an unencodable "
3324 "wide character");
3325 if (reason == NULL)
3326 return NULL;
3327
3328 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3329 "locale", unicode,
3330 (Py_ssize_t)error_pos,
3331 (Py_ssize_t)(error_pos+1),
3332 reason);
3333 Py_DECREF(reason);
3334 if (exc != NULL) {
3335 PyCodec_StrictErrors(exc);
3336 Py_XDECREF(exc);
3337 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003338 return NULL;
3339}
3340
Victor Stinnerad158722010-10-27 00:25:46 +00003341PyObject *
3342PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003343{
Victor Stinner99b95382011-07-04 14:23:54 +02003344#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003345 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003346#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003347 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003348#else
Victor Stinner793b5312011-04-27 00:24:21 +02003349 PyInterpreterState *interp = PyThreadState_GET()->interp;
3350 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3351 cannot use it to encode and decode filenames before it is loaded. Load
3352 the Python codec requires to encode at least its own filename. Use the C
3353 version of the locale codec until the codec registry is initialized and
3354 the Python codec is loaded.
3355
3356 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3357 cannot only rely on it: check also interp->fscodec_initialized for
3358 subinterpreters. */
3359 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003360 return PyUnicode_AsEncodedString(unicode,
3361 Py_FileSystemDefaultEncoding,
3362 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003363 }
3364 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003365 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003366 }
Victor Stinnerad158722010-10-27 00:25:46 +00003367#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003368}
3369
Alexander Belopolsky40018472011-02-26 01:02:56 +00003370PyObject *
3371PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003372 const char *encoding,
3373 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374{
3375 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003376 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003377
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 if (!PyUnicode_Check(unicode)) {
3379 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381 }
Fred Drakee4315f52000-05-09 19:53:39 +00003382
Fred Drakee4315f52000-05-09 19:53:39 +00003383 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003384 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003385 if ((strcmp(lower, "utf-8") == 0) ||
3386 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003387 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003388 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003389 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003390 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003391 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003392 }
Victor Stinner37296e82010-06-10 13:36:23 +00003393 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003394 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003395 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003396 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003397#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003398 else if (strcmp(lower, "mbcs") == 0)
3399 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003400#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003401 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003402 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003403 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404
3405 /* Encode via the codec registry */
3406 v = PyCodec_Encode(unicode, encoding, errors);
3407 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003408 return NULL;
3409
3410 /* The normal path */
3411 if (PyBytes_Check(v))
3412 return v;
3413
3414 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003415 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003416 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003417 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003418
3419 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3420 "encoder %s returned bytearray instead of bytes",
3421 encoding);
3422 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003423 Py_DECREF(v);
3424 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003425 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003426
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003427 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3428 Py_DECREF(v);
3429 return b;
3430 }
3431
3432 PyErr_Format(PyExc_TypeError,
3433 "encoder did not return a bytes object (type=%.400s)",
3434 Py_TYPE(v)->tp_name);
3435 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003436 return NULL;
3437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
3440PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003443{
3444 PyObject *v;
3445
3446 if (!PyUnicode_Check(unicode)) {
3447 PyErr_BadArgument();
3448 goto onError;
3449 }
3450
3451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003453
3454 /* Encode via the codec registry */
3455 v = PyCodec_Encode(unicode, encoding, errors);
3456 if (v == NULL)
3457 goto onError;
3458 if (!PyUnicode_Check(v)) {
3459 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003460 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003461 Py_TYPE(v)->tp_name);
3462 Py_DECREF(v);
3463 goto onError;
3464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003466
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 return NULL;
3469}
3470
/* Locate the first byte sequence in str (len bytes) that mbrtowc()
   rejects as invalid or incomplete in the current locale.

   Returns the byte offset of that sequence, or 0 if none is found.  When
   HAVE_MBRTOWC is not defined no scan is performed and 0 is returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (; remaining != 0; cursor += consumed, remaining -= consumed) {
        consumed = mbrtowc(&wc, (char*)cursor, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3501
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003502PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003503PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003504 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003505{
3506 wchar_t smallbuf[256];
3507 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3508 wchar_t *wstr;
3509 size_t wlen, wlen2;
3510 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003511 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003512 size_t error_pos;
3513 char *errmsg;
3514 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003515
3516 if (locale_error_handler(errors, &surrogateescape) < 0)
3517 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003518
3519 if (str[len] != '\0' || len != strlen(str)) {
3520 PyErr_SetString(PyExc_TypeError, "embedded null character");
3521 return NULL;
3522 }
3523
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003524 if (surrogateescape) {
3525 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003526 wstr = _Py_char2wchar(str, &wlen);
3527 if (wstr == NULL) {
3528 if (wlen == (size_t)-1)
3529 PyErr_NoMemory();
3530 else
3531 PyErr_SetFromErrno(PyExc_OSError);
3532 return NULL;
3533 }
3534
3535 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003536 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003537 }
3538 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003539 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003540#ifndef HAVE_BROKEN_MBSTOWCS
3541 wlen = mbstowcs(NULL, str, 0);
3542#else
3543 wlen = len;
3544#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003545 if (wlen == (size_t)-1)
3546 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547 if (wlen+1 <= smallbuf_len) {
3548 wstr = smallbuf;
3549 }
3550 else {
3551 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3552 return PyErr_NoMemory();
3553
3554 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3555 if (!wstr)
3556 return PyErr_NoMemory();
3557 }
3558
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003559 wlen2 = mbstowcs(wstr, str, wlen+1);
3560 if (wlen2 == (size_t)-1) {
3561 if (wstr != smallbuf)
3562 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003563 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003564 }
3565#ifdef HAVE_BROKEN_MBSTOWCS
3566 assert(wlen2 == wlen);
3567#endif
3568 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3569 if (wstr != smallbuf)
3570 PyMem_Free(wstr);
3571 }
3572 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003573
3574decode_error:
3575 errmsg = strerror(errno);
3576 assert(errmsg != NULL);
3577
3578 error_pos = mbstowcs_errorpos(str, len);
3579 if (errmsg != NULL) {
3580 size_t errlen;
3581 wstr = _Py_char2wchar(errmsg, &errlen);
3582 if (wstr != NULL) {
3583 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003584 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003585 } else
3586 errmsg = NULL;
3587 }
3588 if (errmsg == NULL)
3589 reason = PyUnicode_FromString(
3590 "mbstowcs() encountered an invalid multibyte sequence");
3591 if (reason == NULL)
3592 return NULL;
3593
3594 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3595 "locale", str, len,
3596 (Py_ssize_t)error_pos,
3597 (Py_ssize_t)(error_pos+1),
3598 reason);
3599 Py_DECREF(reason);
3600 if (exc != NULL) {
3601 PyCodec_StrictErrors(exc);
3602 Py_XDECREF(exc);
3603 }
3604 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003605}
3606
3607PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003608PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003609{
3610 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003611 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003612}
3613
3614
3615PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003616PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003617 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003618 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3619}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003620
Christian Heimes5894ba72007-11-04 11:43:14 +00003621PyObject*
3622PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3623{
Victor Stinner99b95382011-07-04 14:23:54 +02003624#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003625 return PyUnicode_DecodeMBCS(s, size, NULL);
3626#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003627 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003628#else
Victor Stinner793b5312011-04-27 00:24:21 +02003629 PyInterpreterState *interp = PyThreadState_GET()->interp;
3630 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3631 cannot use it to encode and decode filenames before it is loaded. Load
3632 the Python codec requires to encode at least its own filename. Use the C
3633 version of the locale codec until the codec registry is initialized and
3634 the Python codec is loaded.
3635
3636 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3637 cannot only rely on it: check also interp->fscodec_initialized for
3638 subinterpreters. */
3639 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003640 return PyUnicode_Decode(s, size,
3641 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003642 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003643 }
3644 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003645 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003646 }
Victor Stinnerad158722010-10-27 00:25:46 +00003647#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003648}
3649
Martin v. Löwis011e8422009-05-05 04:43:17 +00003650
3651int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003652_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003653{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003654 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003655
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003656 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003657 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003658 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3659 PyUnicode_GET_LENGTH(str), '\0', 1);
3660 if (pos == -1)
3661 return 0;
3662 else
3663 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003664}
3665
Antoine Pitrou13348842012-01-29 18:36:34 +01003666int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003667PyUnicode_FSConverter(PyObject* arg, void* addr)
3668{
3669 PyObject *output = NULL;
3670 Py_ssize_t size;
3671 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003672 if (arg == NULL) {
3673 Py_DECREF(*(PyObject**)addr);
3674 return 1;
3675 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003676 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003677 output = arg;
3678 Py_INCREF(output);
3679 }
3680 else {
3681 arg = PyUnicode_FromObject(arg);
3682 if (!arg)
3683 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003684 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003685 Py_DECREF(arg);
3686 if (!output)
3687 return 0;
3688 if (!PyBytes_Check(output)) {
3689 Py_DECREF(output);
3690 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3691 return 0;
3692 }
3693 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003694 size = PyBytes_GET_SIZE(output);
3695 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003696 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003697 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003698 Py_DECREF(output);
3699 return 0;
3700 }
3701 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003702 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003703}
3704
3705
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003706int
3707PyUnicode_FSDecoder(PyObject* arg, void* addr)
3708{
3709 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003710 if (arg == NULL) {
3711 Py_DECREF(*(PyObject**)addr);
3712 return 1;
3713 }
3714 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003715 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003717 output = arg;
3718 Py_INCREF(output);
3719 }
3720 else {
3721 arg = PyBytes_FromObject(arg);
3722 if (!arg)
3723 return 0;
3724 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3725 PyBytes_GET_SIZE(arg));
3726 Py_DECREF(arg);
3727 if (!output)
3728 return 0;
3729 if (!PyUnicode_Check(output)) {
3730 Py_DECREF(output);
3731 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3732 return 0;
3733 }
3734 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003735 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003736 Py_DECREF(output);
3737 return 0;
3738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003739 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003740 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003741 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3742 Py_DECREF(output);
3743 return 0;
3744 }
3745 *(PyObject**)addr = output;
3746 return Py_CLEANUP_SUPPORTED;
3747}
3748
3749
Martin v. Löwis5b222132007-06-10 09:51:05 +00003750char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003752{
Christian Heimesf3863112007-11-22 07:46:41 +00003753 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003754
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003755 if (!PyUnicode_Check(unicode)) {
3756 PyErr_BadArgument();
3757 return NULL;
3758 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003760 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003762 if (PyUnicode_UTF8(unicode) == NULL) {
3763 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3765 if (bytes == NULL)
3766 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003767 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3768 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 Py_DECREF(bytes);
3770 return NULL;
3771 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3773 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3774 PyBytes_AS_STRING(bytes),
3775 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 Py_DECREF(bytes);
3777 }
3778
3779 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003780 *psize = PyUnicode_UTF8_LENGTH(unicode);
3781 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003782}
3783
3784char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3788}
3789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790Py_UNICODE *
3791PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 const unsigned char *one_byte;
3794#if SIZEOF_WCHAR_T == 4
3795 const Py_UCS2 *two_bytes;
3796#else
3797 const Py_UCS4 *four_bytes;
3798 const Py_UCS4 *ucs4_end;
3799 Py_ssize_t num_surrogates;
3800#endif
3801 wchar_t *w;
3802 wchar_t *wchar_end;
3803
3804 if (!PyUnicode_Check(unicode)) {
3805 PyErr_BadArgument();
3806 return NULL;
3807 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003808 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003810 assert(_PyUnicode_KIND(unicode) != 0);
3811 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3816 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817 num_surrogates = 0;
3818
3819 for (; four_bytes < ucs4_end; ++four_bytes) {
3820 if (*four_bytes > 0xFFFF)
3821 ++num_surrogates;
3822 }
3823
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003824 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3825 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3826 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 PyErr_NoMemory();
3828 return NULL;
3829 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003830 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 w = _PyUnicode_WSTR(unicode);
3833 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3834 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3836 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003837 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003839 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3840 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 }
3842 else
3843 *w = *four_bytes;
3844
3845 if (w > wchar_end) {
3846 assert(0 && "Miscalculated string end");
3847 }
3848 }
3849 *w = 0;
3850#else
3851 /* sizeof(wchar_t) == 4 */
3852 Py_FatalError("Impossible unicode object state, wstr and str "
3853 "should share memory already.");
3854 return NULL;
3855#endif
3856 }
3857 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003858 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3859 (_PyUnicode_LENGTH(unicode) + 1));
3860 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 PyErr_NoMemory();
3862 return NULL;
3863 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003864 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3865 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3866 w = _PyUnicode_WSTR(unicode);
3867 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003869 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3870 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 for (; w < wchar_end; ++one_byte, ++w)
3872 *w = *one_byte;
3873 /* null-terminate the wstr */
3874 *w = 0;
3875 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003876 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 for (; w < wchar_end; ++two_bytes, ++w)
3880 *w = *two_bytes;
3881 /* null-terminate the wstr */
3882 *w = 0;
3883#else
3884 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003885 PyObject_FREE(_PyUnicode_WSTR(unicode));
3886 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 Py_FatalError("Impossible unicode object state, wstr "
3888 "and str should share memory already.");
3889 return NULL;
3890#endif
3891 }
3892 else {
3893 assert(0 && "This should never happen.");
3894 }
3895 }
3896 }
3897 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003898 *size = PyUnicode_WSTR_LENGTH(unicode);
3899 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003900}
3901
Alexander Belopolsky40018472011-02-26 01:02:56 +00003902Py_UNICODE *
3903PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906}
3907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908
Alexander Belopolsky40018472011-02-26 01:02:56 +00003909Py_ssize_t
3910PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911{
3912 if (!PyUnicode_Check(unicode)) {
3913 PyErr_BadArgument();
3914 goto onError;
3915 }
3916 return PyUnicode_GET_SIZE(unicode);
3917
Benjamin Peterson29060642009-01-31 22:14:21 +00003918 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 return -1;
3920}
3921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922Py_ssize_t
3923PyUnicode_GetLength(PyObject *unicode)
3924{
Victor Stinner07621332012-06-16 04:53:46 +02003925 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 PyErr_BadArgument();
3927 return -1;
3928 }
Victor Stinner07621332012-06-16 04:53:46 +02003929 if (PyUnicode_READY(unicode) == -1)
3930 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 return PyUnicode_GET_LENGTH(unicode);
3932}
3933
3934Py_UCS4
3935PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3936{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003937 void *data;
3938 int kind;
3939
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003940 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3941 PyErr_BadArgument();
3942 return (Py_UCS4)-1;
3943 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003944 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003945 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 return (Py_UCS4)-1;
3947 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003948 data = PyUnicode_DATA(unicode);
3949 kind = PyUnicode_KIND(unicode);
3950 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951}
3952
3953int
3954PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3955{
3956 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003957 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958 return -1;
3959 }
Victor Stinner488fa492011-12-12 00:01:39 +01003960 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003961 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003962 PyErr_SetString(PyExc_IndexError, "string index out of range");
3963 return -1;
3964 }
Victor Stinner488fa492011-12-12 00:01:39 +01003965 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003966 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003967 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3968 PyErr_SetString(PyExc_ValueError, "character out of range");
3969 return -1;
3970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3972 index, ch);
3973 return 0;
3974}
3975
/* Return the name of the default encoding, which is always the constant
   string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3981
Victor Stinner554f3f02010-06-16 23:33:54 +00003982/* create or adjust a UnicodeDecodeError */
3983static void
3984make_decode_exception(PyObject **exceptionObject,
3985 const char *encoding,
3986 const char *input, Py_ssize_t length,
3987 Py_ssize_t startpos, Py_ssize_t endpos,
3988 const char *reason)
3989{
3990 if (*exceptionObject == NULL) {
3991 *exceptionObject = PyUnicodeDecodeError_Create(
3992 encoding, input, length, startpos, endpos, reason);
3993 }
3994 else {
3995 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3996 goto onError;
3997 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3998 goto onError;
3999 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4000 goto onError;
4001 }
4002 return;
4003
4004onError:
4005 Py_DECREF(*exceptionObject);
4006 *exceptionObject = NULL;
4007}
4008
#ifdef HAVE_MBCS
/* Error handling callback helper for decoders writing into a wchar_t
   (wstr) based output object:
   build arguments, call the callback and check the result;
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   On success *input and *inend are refreshed from the exception's object
   attribute (the callback may have replaced it), *endinpos and *inptr
   are moved to the position returned by the callback, the replacement is
   appended at *outpos and *outpos is advanced.  *output is resized when
   it cannot hold the replacement plus the remaining input.

   return 0 on success, -1 on error (with an exception set)
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after "O!n;" doubles as the TypeError message below. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Bail out: the bytes accessors below must not be applied to an
           object of another type. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string).
       Each addition is checked so the total cannot overflow. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling itself cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX / 2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4115
4116static int
4117unicode_decode_call_errorhandler_writer(
4118 const char *errors, PyObject **errorHandler,
4119 const char *encoding, const char *reason,
4120 const char **input, const char **inend, Py_ssize_t *startinpos,
4121 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4122 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4123{
4124 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4125
4126 PyObject *restuple = NULL;
4127 PyObject *repunicode = NULL;
4128 Py_ssize_t insize;
4129 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004130 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004131 PyObject *inputobj = NULL;
4132
4133 if (*errorHandler == NULL) {
4134 *errorHandler = PyCodec_LookupError(errors);
4135 if (*errorHandler == NULL)
4136 goto onError;
4137 }
4138
4139 make_decode_exception(exceptionObject,
4140 encoding,
4141 *input, *inend - *input,
4142 *startinpos, *endinpos,
4143 reason);
4144 if (*exceptionObject == NULL)
4145 goto onError;
4146
4147 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4148 if (restuple == NULL)
4149 goto onError;
4150 if (!PyTuple_Check(restuple)) {
4151 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4152 goto onError;
4153 }
4154 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004155 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004156
4157 /* Copy back the bytes variables, which might have been modified by the
4158 callback */
4159 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4160 if (!inputobj)
4161 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004162 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004164 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004165 *input = PyBytes_AS_STRING(inputobj);
4166 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004167 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004168 /* we can DECREF safely, as the exception has another reference,
4169 so the object won't go away. */
4170 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004171
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004174 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4176 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004177 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178
Victor Stinner8f674cc2013-04-17 23:02:17 +02004179 if (PyUnicode_READY(repunicode) < 0)
4180 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004181 replen = PyUnicode_GET_LENGTH(repunicode);
4182 writer->min_length += replen;
4183 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004184 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004185 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004186 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004189 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004192 Py_XDECREF(restuple);
4193 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004197 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198}
4199
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  NUL and anything
 * outside the 7-bit range are never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004280
Alexander Belopolsky40018472011-02-26 01:02:56 +00004281PyObject *
4282PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004283 Py_ssize_t size,
4284 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004285{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004286 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4287}
4288
Antoine Pitrou244651a2009-05-04 18:56:13 +00004289/* The decoder. The only state we preserve is our read position,
4290 * i.e. how many characters we have consumed. So if we end in the
4291 * middle of a shift sequence we have to back off the read position
4292 * and the output to the beginning of the sequence, otherwise we lose
4293 * all the shift state (seen bits, number of bits seen, high
4294 * surrogate). */
4295
Alexander Belopolsky40018472011-02-26 01:02:56 +00004296PyObject *
4297PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004298 Py_ssize_t size,
4299 const char *errors,
4300 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004301{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004303 Py_ssize_t startinpos;
4304 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004305 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307 const char *errmsg = "";
4308 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004309 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310 unsigned int base64bits = 0;
4311 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004312 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313 PyObject *errorHandler = NULL;
4314 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004315
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004316 if (size == 0) {
4317 if (consumed)
4318 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004319 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004320 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004322 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004323 _PyUnicodeWriter_Init(&writer);
4324 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004325
4326 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327 e = s + size;
4328
4329 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004330 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004332 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 if (inShift) { /* in a base-64 section */
4335 if (IS_BASE64(ch)) { /* consume a base-64 character */
4336 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4337 base64bits += 6;
4338 s++;
4339 if (base64bits >= 16) {
4340 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004341 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004342 base64bits -= 16;
4343 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004344 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 if (surrogate) {
4346 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004347 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4348 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004349 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004350 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004352 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 }
4354 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004355 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004356 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 }
4359 }
Victor Stinner551ac952011-11-29 22:58:13 +01004360 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 /* first surrogate */
4362 surrogate = outCh;
4363 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004365 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004366 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 }
4368 }
4369 }
4370 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004371 inShift = 0;
4372 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004373 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004374 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004375 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004376 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004378 if (base64bits > 0) { /* left-over bits */
4379 if (base64bits >= 6) {
4380 /* We've seen at least one base-64 character */
4381 errmsg = "partial character in shift sequence";
4382 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 else {
4385 /* Some bits remain; they should be zero */
4386 if (base64buffer != 0) {
4387 errmsg = "non-zero padding bits in shift sequence";
4388 goto utf7Error;
4389 }
4390 }
4391 }
4392 if (ch != '-') {
4393 /* '-' is absorbed; other terminating
4394 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004395 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004396 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004397 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004398 }
4399 }
4400 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 s++; /* consume '+' */
4403 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004405 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004406 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004407 }
4408 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004410 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004412 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004413 }
4414 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004417 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004418 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 else {
4421 startinpos = s-starts;
4422 s++;
4423 errmsg = "unexpected special character";
4424 goto utf7Error;
4425 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 errors, &errorHandler,
4431 "utf7", errmsg,
4432 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004433 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435 }
4436
Antoine Pitrou244651a2009-05-04 18:56:13 +00004437 /* end of string */
4438
4439 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4440 /* if we're in an inconsistent state, that's an error */
4441 if (surrogate ||
4442 (base64bits >= 6) ||
4443 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004445 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004446 errors, &errorHandler,
4447 "utf7", "unterminated shift sequence",
4448 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004449 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 goto onError;
4451 if (s < e)
4452 goto restart;
4453 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004454 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455
4456 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004457 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004459 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 }
4462 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004463 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004465 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 Py_XDECREF(errorHandler);
4468 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 Py_XDECREF(errorHandler);
4473 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004474 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475 return NULL;
4476}
4477
4478
Alexander Belopolsky40018472011-02-26 01:02:56 +00004479PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004480_PyUnicode_EncodeUTF7(PyObject *str,
4481 int base64SetO,
4482 int base64WhiteSpace,
4483 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004485 int kind;
4486 void *data;
4487 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004488 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004490 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 unsigned int base64bits = 0;
4492 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493 char * out;
4494 char * start;
4495
Benjamin Petersonbac79492012-01-14 13:34:47 -05004496 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004497 return NULL;
4498 kind = PyUnicode_KIND(str);
4499 data = PyUnicode_DATA(str);
4500 len = PyUnicode_GET_LENGTH(str);
4501
4502 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004505 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004506 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004507 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004508 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004509 if (v == NULL)
4510 return NULL;
4511
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004512 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004513 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004514 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 if (inShift) {
4517 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4518 /* shifting out */
4519 if (base64bits) { /* output remaining bits */
4520 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4521 base64buffer = 0;
4522 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523 }
4524 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 /* Characters not in the BASE64 set implicitly unshift the sequence
4526 so no '-' is required, except if the character is itself a '-' */
4527 if (IS_BASE64(ch) || ch == '-') {
4528 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004529 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004530 *out++ = (char) ch;
4531 }
4532 else {
4533 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004534 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 else { /* not in a shift sequence */
4537 if (ch == '+') {
4538 *out++ = '+';
4539 *out++ = '-';
4540 }
4541 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4542 *out++ = (char) ch;
4543 }
4544 else {
4545 *out++ = '+';
4546 inShift = 1;
4547 goto encode_char;
4548 }
4549 }
4550 continue;
4551encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004553 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004554
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 /* code first surrogate */
4556 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004557 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 while (base64bits >= 6) {
4559 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4560 base64bits -= 6;
4561 }
4562 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004563 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 base64bits += 16;
4566 base64buffer = (base64buffer << 16) | ch;
4567 while (base64bits >= 6) {
4568 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4569 base64bits -= 6;
4570 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004571 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 if (base64bits)
4573 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4574 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004576 if (_PyBytes_Resize(&v, out - start) < 0)
4577 return NULL;
4578 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004580PyObject *
4581PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4582 Py_ssize_t size,
4583 int base64SetO,
4584 int base64WhiteSpace,
4585 const char *errors)
4586{
4587 PyObject *result;
4588 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4589 if (tmp == NULL)
4590 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004591 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004592 base64WhiteSpace, errors);
4593 Py_DECREF(tmp);
4594 return result;
4595}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004596
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597#undef IS_BASE64
4598#undef FROM_BASE64
4599#undef TO_BASE64
4600#undef DECODE_DIRECT
4601#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603/* --- UTF-8 Codec -------------------------------------------------------- */
4604
Alexander Belopolsky40018472011-02-26 01:02:56 +00004605PyObject *
4606PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004607 Py_ssize_t size,
4608 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609{
Walter Dörwald69652032004-09-07 20:24:22 +00004610 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4611}
4612
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004613#include "stringlib/asciilib.h"
4614#include "stringlib/codecs.h"
4615#include "stringlib/undef.h"
4616
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004617#include "stringlib/ucs1lib.h"
4618#include "stringlib/codecs.h"
4619#include "stringlib/undef.h"
4620
4621#include "stringlib/ucs2lib.h"
4622#include "stringlib/codecs.h"
4623#include "stringlib/undef.h"
4624
4625#include "stringlib/ucs4lib.h"
4626#include "stringlib/codecs.h"
4627#include "stringlib/undef.h"
4628
Antoine Pitrouab868312009-01-10 15:40:25 +00004629/* Mask to quickly check whether a C 'long' contains a
4630 non-ASCII, UTF8-encoded char. */
4631#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004632# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004633#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004634# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004635#else
4636# error C 'long' size should be either 4 or 8!
4637#endif
4638
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004639static Py_ssize_t
4640ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004641{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004642 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004643 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004644
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004645 /*
4646 * Issue #17237: m68k is a bit different from most architectures in
4647 * that objects do not use "natural alignment" - for example, int and
4648 * long are only aligned at 2-byte boundaries. Therefore the assert()
4649 * won't work; also, tests have shown that skipping the "optimised
4650 * version" will even speed up m68k.
4651 */
4652#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004653#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004654 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4655 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004656 /* Fast path, see in STRINGLIB(utf8_decode) for
4657 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004658 /* Help allocation */
4659 const char *_p = p;
4660 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004661 while (_p < aligned_end) {
4662 unsigned long value = *(const unsigned long *) _p;
4663 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004665 *((unsigned long *)q) = value;
4666 _p += SIZEOF_LONG;
4667 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004668 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004669 p = _p;
4670 while (p < end) {
4671 if ((unsigned char)*p & 0x80)
4672 break;
4673 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004675 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004678#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004679 while (p < end) {
4680 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4681 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004682 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004683 /* Help allocation */
4684 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685 while (_p < aligned_end) {
4686 unsigned long value = *(unsigned long *) _p;
4687 if (value & ASCII_CHAR_MASK)
4688 break;
4689 _p += SIZEOF_LONG;
4690 }
4691 p = _p;
4692 if (_p == end)
4693 break;
4694 }
4695 if ((unsigned char)*p & 0x80)
4696 break;
4697 ++p;
4698 }
4699 memcpy(dest, start, p - start);
4700 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701}
Antoine Pitrouab868312009-01-10 15:40:25 +00004702
Victor Stinner785938e2011-12-11 20:09:03 +01004703PyObject *
4704PyUnicode_DecodeUTF8Stateful(const char *s,
4705 Py_ssize_t size,
4706 const char *errors,
4707 Py_ssize_t *consumed)
4708{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004709 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004710 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004712
4713 Py_ssize_t startinpos;
4714 Py_ssize_t endinpos;
4715 const char *errmsg = "";
4716 PyObject *errorHandler = NULL;
4717 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004718
4719 if (size == 0) {
4720 if (consumed)
4721 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004722 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004723 }
4724
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004725 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4726 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004727 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728 *consumed = 1;
4729 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004730 }
4731
Victor Stinner8f674cc2013-04-17 23:02:17 +02004732 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004733 writer.min_length = size;
4734 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004735 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004736
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004737 writer.pos = ascii_decode(s, end, writer.data);
4738 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 while (s < end) {
4740 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004741 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004742 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004743 if (PyUnicode_IS_ASCII(writer.buffer))
4744 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004747 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004748 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004749 } else {
4750 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004751 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004752 }
4753
4754 switch (ch) {
4755 case 0:
4756 if (s == end || consumed)
4757 goto End;
4758 errmsg = "unexpected end of data";
4759 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004760 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 break;
4762 case 1:
4763 errmsg = "invalid start byte";
4764 startinpos = s - starts;
4765 endinpos = startinpos + 1;
4766 break;
4767 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004768 case 3:
4769 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 errmsg = "invalid continuation byte";
4771 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004772 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004773 break;
4774 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004775 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004776 goto onError;
4777 continue;
4778 }
4779
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004780 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004781 errors, &errorHandler,
4782 "utf-8", errmsg,
4783 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004784 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004786 }
4787
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004789 if (consumed)
4790 *consumed = s - starts;
4791
4792 Py_XDECREF(errorHandler);
4793 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004794 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004795
4796onError:
4797 Py_XDECREF(errorHandler);
4798 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004799 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004800 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004801}
4802
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004803#ifdef __APPLE__
4804
4805/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004806 used to decode the command line arguments on Mac OS X.
4807
4808 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004809 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004810
4811wchar_t*
4812_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4813{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004814 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004815 wchar_t *unicode;
4816 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004817
4818 /* Note: size will always be longer than the resulting Unicode
4819 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004820 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004821 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004822 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004823 if (!unicode)
4824 return NULL;
4825
4826 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004827 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004828 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004829 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004830 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004831#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004832 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004833#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004834 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004835#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004836 if (ch > 0xFF) {
4837#if SIZEOF_WCHAR_T == 4
4838 assert(0);
4839#else
4840 assert(Py_UNICODE_IS_SURROGATE(ch));
4841 /* compute and append the two surrogates: */
4842 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4843 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4844#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004845 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004846 else {
4847 if (!ch && s == e)
4848 break;
4849 /* surrogateescape */
4850 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4851 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004852 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004854 return unicode;
4855}
4856
4857#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004859/* Primary internal function which creates utf8 encoded bytes objects.
4860
4861 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004862 and allocate exactly as much space needed at the end. Else allocate the
4863 maximum possible needed (4 result bytes per Unicode character), and return
4864 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004865*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004866PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004867_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868{
Victor Stinner6099a032011-12-18 14:22:26 +01004869 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004870 void *data;
4871 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873 if (!PyUnicode_Check(unicode)) {
4874 PyErr_BadArgument();
4875 return NULL;
4876 }
4877
4878 if (PyUnicode_READY(unicode) == -1)
4879 return NULL;
4880
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004881 if (PyUnicode_UTF8(unicode))
4882 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4883 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004884
4885 kind = PyUnicode_KIND(unicode);
4886 data = PyUnicode_DATA(unicode);
4887 size = PyUnicode_GET_LENGTH(unicode);
4888
Benjamin Petersonead6b532011-12-20 17:23:42 -06004889 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004890 default:
4891 assert(0);
4892 case PyUnicode_1BYTE_KIND:
4893 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4894 assert(!PyUnicode_IS_ASCII(unicode));
4895 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4896 case PyUnicode_2BYTE_KIND:
4897 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4898 case PyUnicode_4BYTE_KIND:
4899 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901}
4902
Alexander Belopolsky40018472011-02-26 01:02:56 +00004903PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004904PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4905 Py_ssize_t size,
4906 const char *errors)
4907{
4908 PyObject *v, *unicode;
4909
4910 unicode = PyUnicode_FromUnicode(s, size);
4911 if (unicode == NULL)
4912 return NULL;
4913 v = _PyUnicode_AsUTF8String(unicode, errors);
4914 Py_DECREF(unicode);
4915 return v;
4916}
4917
4918PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004919PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004921 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922}
4923
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924/* --- UTF-32 Codec ------------------------------------------------------- */
4925
4926PyObject *
4927PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 Py_ssize_t size,
4929 const char *errors,
4930 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004931{
4932 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4933}
4934
4935PyObject *
4936PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 Py_ssize_t size,
4938 const char *errors,
4939 int *byteorder,
4940 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941{
4942 const char *starts = s;
4943 Py_ssize_t startinpos;
4944 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004945 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004946 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004947 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949 PyObject *errorHandler = NULL;
4950 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004951
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952 q = (unsigned char *)s;
4953 e = q + size;
4954
4955 if (byteorder)
4956 bo = *byteorder;
4957
4958 /* Check for BOM marks (U+FEFF) in the input and adjust current
4959 byte order setting accordingly. In native mode, the leading BOM
4960 mark is skipped, in all other modes, it is copied to the output
4961 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004962 if (bo == 0 && size >= 4) {
4963 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4964 if (bom == 0x0000FEFF) {
4965 bo = -1;
4966 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004968 else if (bom == 0xFFFE0000) {
4969 bo = 1;
4970 q += 4;
4971 }
4972 if (byteorder)
4973 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974 }
4975
Victor Stinnere64322e2012-10-30 23:12:47 +01004976 if (q == e) {
4977 if (consumed)
4978 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004979 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980 }
4981
Victor Stinnere64322e2012-10-30 23:12:47 +01004982#ifdef WORDS_BIGENDIAN
4983 le = bo < 0;
4984#else
4985 le = bo <= 0;
4986#endif
4987
Victor Stinner8f674cc2013-04-17 23:02:17 +02004988 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004989 writer.min_length = (e - q + 3) / 4;
4990 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004992
Victor Stinnere64322e2012-10-30 23:12:47 +01004993 while (1) {
4994 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004996
Victor Stinnere64322e2012-10-30 23:12:47 +01004997 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004998 enum PyUnicode_Kind kind = writer.kind;
4999 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005000 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005001 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005002 if (le) {
5003 do {
5004 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5005 if (ch > maxch)
5006 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005007 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 q += 4;
5009 } while (q <= last);
5010 }
5011 else {
5012 do {
5013 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5014 if (ch > maxch)
5015 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005016 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005017 q += 4;
5018 } while (q <= last);
5019 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005020 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005021 }
5022
5023 if (ch <= maxch) {
5024 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005026 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005028 startinpos = ((const char *)q) - starts;
5029 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005031 else {
5032 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005033 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 goto onError;
5035 q += 4;
5036 continue;
5037 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005039 startinpos = ((const char *)q) - starts;
5040 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005042
5043 /* The remaining input chars are ignored if the callback
5044 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005045 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 errors, &errorHandler,
5047 "utf32", errmsg,
5048 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005049 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 }
5052
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056 Py_XDECREF(errorHandler);
5057 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005058 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005061 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062 Py_XDECREF(errorHandler);
5063 Py_XDECREF(exc);
5064 return NULL;
5065}
5066
5067PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005068_PyUnicode_EncodeUTF32(PyObject *str,
5069 const char *errors,
5070 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005072 int kind;
5073 void *data;
5074 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005075 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005077 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005079#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 int iorder[] = {0, 1, 2, 3};
5081#else
5082 int iorder[] = {3, 2, 1, 0};
5083#endif
5084
Benjamin Peterson29060642009-01-31 22:14:21 +00005085#define STORECHAR(CH) \
5086 do { \
5087 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5088 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5089 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5090 p[iorder[0]] = (CH) & 0xff; \
5091 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092 } while(0)
5093
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005094 if (!PyUnicode_Check(str)) {
5095 PyErr_BadArgument();
5096 return NULL;
5097 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005098 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005099 return NULL;
5100 kind = PyUnicode_KIND(str);
5101 data = PyUnicode_DATA(str);
5102 len = PyUnicode_GET_LENGTH(str);
5103
5104 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005105 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005107 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108 if (v == NULL)
5109 return NULL;
5110
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005111 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005113 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005114 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005115 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116
5117 if (byteorder == -1) {
5118 /* force LE */
5119 iorder[0] = 0;
5120 iorder[1] = 1;
5121 iorder[2] = 2;
5122 iorder[3] = 3;
5123 }
5124 else if (byteorder == 1) {
5125 /* force BE */
5126 iorder[0] = 3;
5127 iorder[1] = 2;
5128 iorder[2] = 1;
5129 iorder[3] = 0;
5130 }
5131
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005132 for (i = 0; i < len; i++)
5133 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005134
5135 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005136 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137#undef STORECHAR
5138}
5139
Alexander Belopolsky40018472011-02-26 01:02:56 +00005140PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005141PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5142 Py_ssize_t size,
5143 const char *errors,
5144 int byteorder)
5145{
5146 PyObject *result;
5147 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5148 if (tmp == NULL)
5149 return NULL;
5150 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5151 Py_DECREF(tmp);
5152 return result;
5153}
5154
5155PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005156PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005157{
Victor Stinnerb960b342011-11-20 19:12:52 +01005158 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159}
5160
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161/* --- UTF-16 Codec ------------------------------------------------------- */
5162
Tim Peters772747b2001-08-09 22:21:55 +00005163PyObject *
5164PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 Py_ssize_t size,
5166 const char *errors,
5167 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168{
Walter Dörwald69652032004-09-07 20:24:22 +00005169 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5170}
5171
5172PyObject *
5173PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 Py_ssize_t size,
5175 const char *errors,
5176 int *byteorder,
5177 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005178{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005180 Py_ssize_t startinpos;
5181 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005182 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005183 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005184 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005185 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005186 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005187 PyObject *errorHandler = NULL;
5188 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189
Tim Peters772747b2001-08-09 22:21:55 +00005190 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005191 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192
5193 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005194 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005196 /* Check for BOM marks (U+FEFF) in the input and adjust current
5197 byte order setting accordingly. In native mode, the leading BOM
5198 mark is skipped, in all other modes, it is copied to the output
5199 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005200 if (bo == 0 && size >= 2) {
5201 const Py_UCS4 bom = (q[1] << 8) | q[0];
5202 if (bom == 0xFEFF) {
5203 q += 2;
5204 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005206 else if (bom == 0xFFFE) {
5207 q += 2;
5208 bo = 1;
5209 }
5210 if (byteorder)
5211 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
Antoine Pitrou63065d72012-05-15 23:48:04 +02005214 if (q == e) {
5215 if (consumed)
5216 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005217 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005218 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005219
Christian Heimes743e0cd2012-10-17 23:52:17 +02005220#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005221 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005222#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005223 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005224#endif
Tim Peters772747b2001-08-09 22:21:55 +00005225
Antoine Pitrou63065d72012-05-15 23:48:04 +02005226 /* Note: size will always be longer than the resulting Unicode
5227 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005228 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005229 writer.min_length = (e - q + 1) / 2;
5230 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005231 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005232
Antoine Pitrou63065d72012-05-15 23:48:04 +02005233 while (1) {
5234 Py_UCS4 ch = 0;
5235 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005236 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005237 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005238 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005239 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005240 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005241 native_ordering);
5242 else
5243 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005245 native_ordering);
5246 } else if (kind == PyUnicode_2BYTE_KIND) {
5247 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005248 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005249 native_ordering);
5250 } else {
5251 assert(kind == PyUnicode_4BYTE_KIND);
5252 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005253 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005254 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005255 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005257
Antoine Pitrou63065d72012-05-15 23:48:04 +02005258 switch (ch)
5259 {
5260 case 0:
5261 /* remaining byte at the end? (size should be even) */
5262 if (q == e || consumed)
5263 goto End;
5264 errmsg = "truncated data";
5265 startinpos = ((const char *)q) - starts;
5266 endinpos = ((const char *)e) - starts;
5267 break;
5268 /* The remaining input chars are ignored if the callback
5269 chooses to skip the input */
5270 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005271 q -= 2;
5272 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005273 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005274 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005275 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005276 endinpos = ((const char *)e) - starts;
5277 break;
5278 case 2:
5279 errmsg = "illegal encoding";
5280 startinpos = ((const char *)q) - 2 - starts;
5281 endinpos = startinpos + 2;
5282 break;
5283 case 3:
5284 errmsg = "illegal UTF-16 surrogate";
5285 startinpos = ((const char *)q) - 4 - starts;
5286 endinpos = startinpos + 2;
5287 break;
5288 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005289 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005290 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 continue;
5292 }
5293
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005294 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005295 errors,
5296 &errorHandler,
5297 "utf16", errmsg,
5298 &starts,
5299 (const char **)&e,
5300 &startinpos,
5301 &endinpos,
5302 &exc,
5303 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005304 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 }
5307
Antoine Pitrou63065d72012-05-15 23:48:04 +02005308End:
Walter Dörwald69652032004-09-07 20:24:22 +00005309 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005312 Py_XDECREF(errorHandler);
5313 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005314 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005317 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005318 Py_XDECREF(errorHandler);
5319 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320 return NULL;
5321}
5322
Tim Peters772747b2001-08-09 22:21:55 +00005323PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005324_PyUnicode_EncodeUTF16(PyObject *str,
5325 const char *errors,
5326 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005328 enum PyUnicode_Kind kind;
5329 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005330 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005331 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005332 unsigned short *out;
5333 Py_ssize_t bytesize;
5334 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005335#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005336 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005337#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005338 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005339#endif
5340
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005341 if (!PyUnicode_Check(str)) {
5342 PyErr_BadArgument();
5343 return NULL;
5344 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005345 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005346 return NULL;
5347 kind = PyUnicode_KIND(str);
5348 data = PyUnicode_DATA(str);
5349 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005350
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005351 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005352 if (kind == PyUnicode_4BYTE_KIND) {
5353 const Py_UCS4 *in = (const Py_UCS4 *)data;
5354 const Py_UCS4 *end = in + len;
5355 while (in < end)
5356 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005357 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005358 }
5359 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005361 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005362 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 if (v == NULL)
5364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005366 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005367 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005368 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005370 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005371 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005372 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005373
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005374 switch (kind) {
5375 case PyUnicode_1BYTE_KIND: {
5376 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5377 break;
Tim Peters772747b2001-08-09 22:21:55 +00005378 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005379 case PyUnicode_2BYTE_KIND: {
5380 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5381 break;
Tim Peters772747b2001-08-09 22:21:55 +00005382 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005383 case PyUnicode_4BYTE_KIND: {
5384 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5385 break;
5386 }
5387 default:
5388 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005389 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005390
5391 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005392 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393}
5394
Alexander Belopolsky40018472011-02-26 01:02:56 +00005395PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5397 Py_ssize_t size,
5398 const char *errors,
5399 int byteorder)
5400{
5401 PyObject *result;
5402 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5403 if (tmp == NULL)
5404 return NULL;
5405 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5406 Py_DECREF(tmp);
5407 return result;
5408}
5409
5410PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005411PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005413 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414}
5415
5416/* --- Unicode Escape Codec ----------------------------------------------- */
5417
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   Returns the number of characters the decoded string will contain, or
   -1 when that cannot be determined cheaply, i.e. when:
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte of the input, or
     - an escape is used whose result is not guaranteed to be ASCII
       (octal, \x, \u, \U, \N).

   An unrecognised escape such as "\q" is counted as two characters
   (the backslash and the character following it).
*/
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with
       it: forming s + size for size < 0 is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5472
Fredrik Lundh06d12682001-01-24 07:59:11 +00005473static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005474
Alexander Belopolsky40018472011-02-26 01:02:56 +00005475PyObject *
5476PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005477 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005478 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481 Py_ssize_t startinpos;
5482 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005483 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005485 char* message;
5486 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487 PyObject *errorHandler = NULL;
5488 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005489 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005490
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005491 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005492 if (len == 0)
5493 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005494
5495 /* After length_of_escaped_ascii_string() there are two alternatives,
5496 either the string is pure ASCII with named escapes like \n, etc.
5497 and we determined it's exact size (common case)
5498 or it contains \x, \u, ... escape sequences. then we create a
5499 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005500 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005501 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005502 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005503 }
5504 else {
5505 /* Escaped strings will always be longer than the resulting
5506 Unicode string, so we start with size here and then reduce the
5507 length after conversion to the true value.
5508 (but if the error callback returns a long replacement string
5509 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005510 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005511 }
5512
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005514 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005516
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 while (s < end) {
5518 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005519 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005520 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521
5522 /* Non-escape characters are interpreted as Unicode ordinals */
5523 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005524 x = (unsigned char)*s;
5525 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005526 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005527 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 continue;
5529 }
5530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 /* \ - Escapes */
5533 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005534 c = *s++;
5535 if (s > end)
5536 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005537
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005538 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005541#define WRITECHAR(ch) \
5542 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005543 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005544 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005545 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005548 case '\\': WRITECHAR('\\'); break;
5549 case '\'': WRITECHAR('\''); break;
5550 case '\"': WRITECHAR('\"'); break;
5551 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005552 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005553 case 'f': WRITECHAR('\014'); break;
5554 case 't': WRITECHAR('\t'); break;
5555 case 'n': WRITECHAR('\n'); break;
5556 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005558 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005559 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005560 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 case '0': case '1': case '2': case '3':
5564 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005565 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005566 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005567 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005568 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005569 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005571 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 break;
5573
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 /* hex escapes */
5575 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005577 digits = 2;
5578 message = "truncated \\xXX escape";
5579 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005583 digits = 4;
5584 message = "truncated \\uXXXX escape";
5585 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005588 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005589 digits = 8;
5590 message = "truncated \\UXXXXXXXX escape";
5591 hexescape:
5592 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005593 if (end - s < digits) {
5594 /* count only hex digits */
5595 for (; s < end; ++s) {
5596 c = (unsigned char)*s;
5597 if (!Py_ISXDIGIT(c))
5598 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005599 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005600 goto error;
5601 }
5602 for (; digits--; ++s) {
5603 c = (unsigned char)*s;
5604 if (!Py_ISXDIGIT(c))
5605 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005606 chr = (chr<<4) & ~0xF;
5607 if (c >= '0' && c <= '9')
5608 chr += c - '0';
5609 else if (c >= 'a' && c <= 'f')
5610 chr += 10 + c - 'a';
5611 else
5612 chr += 10 + c - 'A';
5613 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005614 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005615 /* _decoding_error will have already written into the
5616 target buffer. */
5617 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005618 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005619 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005620 message = "illegal Unicode character";
5621 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005622 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005623 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005624 break;
5625
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005627 case 'N':
5628 message = "malformed \\N character escape";
5629 if (ucnhash_CAPI == NULL) {
5630 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5632 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005633 if (ucnhash_CAPI == NULL)
5634 goto ucnhashError;
5635 }
5636 if (*s == '{') {
5637 const char *start = s+1;
5638 /* look for the closing brace */
5639 while (*s != '}' && s < end)
5640 s++;
5641 if (s > start && s < end && *s == '}') {
5642 /* found a name. look it up in the unicode database */
5643 message = "unknown Unicode character name";
5644 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005645 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005646 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005647 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005648 goto store;
5649 }
5650 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005651 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005652
5653 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005654 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 message = "\\ at end of string";
5656 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005657 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005658 }
5659 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005660 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005661 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005662 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005663 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005665 continue;
5666
5667 error:
5668 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005669 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005670 errors, &errorHandler,
5671 "unicodeescape", message,
5672 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005673 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005674 goto onError;
5675 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005677#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005679 Py_XDECREF(errorHandler);
5680 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005681 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005682
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005684 PyErr_SetString(
5685 PyExc_UnicodeError,
5686 "\\N escapes not supported (can't load unicodedata module)"
5687 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 Py_XDECREF(errorHandler);
5690 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005691 return NULL;
5692
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 Py_XDECREF(errorHandler);
5696 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 return NULL;
5698}
5699
/* Return a Unicode-Escape encoded bytes object for the Unicode object.

   The result is the bare escaped text; it is not enclosed in quotes.

*/
5706
Alexander Belopolsky40018472011-02-26 01:02:56 +00005707PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005708PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005710 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005711 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005713 int kind;
5714 void *data;
5715 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716
Ezio Melottie7f90372012-10-05 03:33:31 +03005717 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005718 escape.
5719
Ezio Melottie7f90372012-10-05 03:33:31 +03005720 For UCS1 strings it's '\xxx', 4 bytes per source character.
5721 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5722 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005723 */
5724
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005725 if (!PyUnicode_Check(unicode)) {
5726 PyErr_BadArgument();
5727 return NULL;
5728 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005729 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 return NULL;
5731 len = PyUnicode_GET_LENGTH(unicode);
5732 kind = PyUnicode_KIND(unicode);
5733 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005734 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5736 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5737 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5738 }
5739
5740 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005741 return PyBytes_FromStringAndSize(NULL, 0);
5742
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005743 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005745
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005746 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 if (repr == NULL)
5751 return NULL;
5752
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005753 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005756 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005757
Walter Dörwald79e913e2007-05-12 11:08:06 +00005758 /* Escape backslashes */
5759 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 *p++ = '\\';
5761 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005762 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005763 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005764
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005765 /* Map 21-bit characters to '\U00xxxxxx' */
5766 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005767 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005768 *p++ = '\\';
5769 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005770 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5771 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5772 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5773 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5774 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5775 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5776 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5777 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005779 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005780
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005782 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 *p++ = '\\';
5784 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005785 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5786 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5787 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5788 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005790
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005791 /* Map special whitespace to '\t', \n', '\r' */
5792 else if (ch == '\t') {
5793 *p++ = '\\';
5794 *p++ = 't';
5795 }
5796 else if (ch == '\n') {
5797 *p++ = '\\';
5798 *p++ = 'n';
5799 }
5800 else if (ch == '\r') {
5801 *p++ = '\\';
5802 *p++ = 'r';
5803 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005804
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005805 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005806 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005808 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005809 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5810 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005811 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005812
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 /* Copy everything else as-is */
5814 else
5815 *p++ = (char) ch;
5816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005818 assert(p - PyBytes_AS_STRING(repr) > 0);
5819 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5820 return NULL;
5821 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822}
5823
Alexander Belopolsky40018472011-02-26 01:02:56 +00005824PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005825PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5826 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005828 PyObject *result;
5829 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5830 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005832 result = PyUnicode_AsUnicodeEscapeString(tmp);
5833 Py_DECREF(tmp);
5834 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835}
5836
5837/* --- Raw Unicode Escape Codec ------------------------------------------- */
5838
Alexander Belopolsky40018472011-02-26 01:02:56 +00005839PyObject *
5840PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005841 Py_ssize_t size,
5842 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005844 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005845 Py_ssize_t startinpos;
5846 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005847 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 const char *end;
5849 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 PyObject *errorHandler = NULL;
5851 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005852
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005853 if (size == 0)
5854 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 /* Escaped strings will always be longer than the resulting
5857 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005858 length after conversion to the true value. (But decoding error
5859 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005860 _PyUnicodeWriter_Init(&writer);
5861 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005862
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 end = s + size;
5864 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 unsigned char c;
5866 Py_UCS4 x;
5867 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005868 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 /* Non-escape characters are interpreted as Unicode ordinals */
5871 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005872 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005873 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005874 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005876 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 startinpos = s-starts;
5878
5879 /* \u-escapes are only interpreted iff the number of leading
5880 backslashes if odd */
5881 bs = s;
5882 for (;s < end;) {
5883 if (*s != '\\')
5884 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005885 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005886 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005887 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 }
5889 if (((s - bs) & 1) == 0 ||
5890 s >= end ||
5891 (*s != 'u' && *s != 'U')) {
5892 continue;
5893 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 count = *s=='u' ? 4 : 8;
5896 s++;
5897
5898 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 for (x = 0, i = 0; i < count; ++i, ++s) {
5900 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005901 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005903 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 errors, &errorHandler,
5905 "rawunicodeescape", "truncated \\uXXXX",
5906 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005907 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 goto onError;
5909 goto nextByte;
5910 }
5911 x = (x<<4) & ~0xF;
5912 if (c >= '0' && c <= '9')
5913 x += c - '0';
5914 else if (c >= 'a' && c <= 'f')
5915 x += 10 + c - 'a';
5916 else
5917 x += 10 + c - 'A';
5918 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005919 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005920 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005921 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005922 }
5923 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005924 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005925 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005926 errors, &errorHandler,
5927 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005929 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005931 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 nextByte:
5933 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 Py_XDECREF(errorHandler);
5936 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005937 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005938
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005940 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941 Py_XDECREF(errorHandler);
5942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 return NULL;
5944}
5945
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005946
Alexander Belopolsky40018472011-02-26 01:02:56 +00005947PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005948PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005950 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 char *p;
5952 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005953 Py_ssize_t expandsize, pos;
5954 int kind;
5955 void *data;
5956 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005958 if (!PyUnicode_Check(unicode)) {
5959 PyErr_BadArgument();
5960 return NULL;
5961 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005962 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005963 return NULL;
5964 kind = PyUnicode_KIND(unicode);
5965 data = PyUnicode_DATA(unicode);
5966 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005967 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5968 bytes, and 1 byte characters 4. */
5969 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005970
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005971 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005973
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005974 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 if (repr == NULL)
5976 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005977 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005978 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005980 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005981 for (pos = 0; pos < len; pos++) {
5982 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 /* Map 32-bit characters to '\Uxxxxxxxx' */
5984 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005985 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005986 *p++ = '\\';
5987 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005988 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5989 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5990 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5991 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5992 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5993 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5994 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5995 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005996 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005998 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 *p++ = '\\';
6000 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006001 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6002 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6003 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6004 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 /* Copy everything else as-is */
6007 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 *p++ = (char) ch;
6009 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006010
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006011 assert(p > q);
6012 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006013 return NULL;
6014 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015}
6016
Alexander Belopolsky40018472011-02-26 01:02:56 +00006017PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006018PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6019 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006021 PyObject *result;
6022 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6023 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006024 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006025 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6026 Py_DECREF(tmp);
6027 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006030/* --- Unicode Internal Codec ------------------------------------------- */
6031
Alexander Belopolsky40018472011-02-26 01:02:56 +00006032PyObject *
6033_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006034 Py_ssize_t size,
6035 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006036{
6037 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006038 Py_ssize_t startinpos;
6039 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006040 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006041 const char *end;
6042 const char *reason;
6043 PyObject *errorHandler = NULL;
6044 PyObject *exc = NULL;
6045
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006046 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006047 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006048 1))
6049 return NULL;
6050
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006051 if (size == 0)
6052 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053
Victor Stinner8f674cc2013-04-17 23:02:17 +02006054 _PyUnicodeWriter_Init(&writer);
6055 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6056 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006058 }
6059 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006060
Victor Stinner8f674cc2013-04-17 23:02:17 +02006061 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006062 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006063 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006064 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006065 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006066 endinpos = end-starts;
6067 reason = "truncated input";
6068 goto error;
6069 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006070 /* We copy the raw representation one byte at a time because the
6071 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006072 ((char *) &uch)[0] = s[0];
6073 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006074#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006075 ((char *) &uch)[2] = s[2];
6076 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006077#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006078 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006079#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006080 /* We have to sanity check the raw data, otherwise doom looms for
6081 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006082 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006083 endinpos = s - starts + Py_UNICODE_SIZE;
6084 reason = "illegal code point (> 0x10FFFF)";
6085 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006086 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006087#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006088 s += Py_UNICODE_SIZE;
6089#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006090 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006091 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006092 Py_UNICODE uch2;
6093 ((char *) &uch2)[0] = s[0];
6094 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006095 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006096 {
Victor Stinner551ac952011-11-29 22:58:13 +01006097 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006098 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006099 }
6100 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006101#endif
6102
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006103 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006104 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006105 continue;
6106
6107 error:
6108 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006109 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006110 errors, &errorHandler,
6111 "unicode_internal", reason,
6112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006113 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006114 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006115 }
6116
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006117 Py_XDECREF(errorHandler);
6118 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006119 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006120
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006122 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006123 Py_XDECREF(errorHandler);
6124 Py_XDECREF(exc);
6125 return NULL;
6126}
6127
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128/* --- Latin-1 Codec ------------------------------------------------------ */
6129
Alexander Belopolsky40018472011-02-26 01:02:56 +00006130PyObject *
6131PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006132 Py_ssize_t size,
6133 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006136 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137}
6138
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006140static void
6141make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006142 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006143 PyObject *unicode,
6144 Py_ssize_t startpos, Py_ssize_t endpos,
6145 const char *reason)
6146{
6147 if (*exceptionObject == NULL) {
6148 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006149 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006150 encoding, unicode, startpos, endpos, reason);
6151 }
6152 else {
6153 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6154 goto onError;
6155 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6156 goto onError;
6157 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6158 goto onError;
6159 return;
6160 onError:
6161 Py_DECREF(*exceptionObject);
6162 *exceptionObject = NULL;
6163 }
6164}
6165
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006166/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006167static void
6168raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006169 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006170 PyObject *unicode,
6171 Py_ssize_t startpos, Py_ssize_t endpos,
6172 const char *reason)
6173{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006174 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006175 encoding, unicode, startpos, endpos, reason);
6176 if (*exceptionObject != NULL)
6177 PyCodec_StrictErrors(*exceptionObject);
6178}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179
6180/* error handling callback helper:
6181 build arguments, call the callback and check the arguments,
6182 put the result into newpos and return the replacement string, which
6183 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006184static PyObject *
6185unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006186 PyObject **errorHandler,
6187 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006188 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006189 Py_ssize_t startpos, Py_ssize_t endpos,
6190 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006191{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006192 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006193 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006194 PyObject *restuple;
6195 PyObject *resunicode;
6196
6197 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201 }
6202
Benjamin Petersonbac79492012-01-14 13:34:47 -05006203 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006204 return NULL;
6205 len = PyUnicode_GET_LENGTH(unicode);
6206
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006207 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006208 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006211
6212 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006216 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006217 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 Py_DECREF(restuple);
6219 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006220 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006221 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 &resunicode, newpos)) {
6223 Py_DECREF(restuple);
6224 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006225 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006226 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6227 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6228 Py_DECREF(restuple);
6229 return NULL;
6230 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006231 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006232 *newpos = len + *newpos;
6233 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6235 Py_DECREF(restuple);
6236 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006237 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238 Py_INCREF(resunicode);
6239 Py_DECREF(restuple);
6240 return resunicode;
6241}
6242
Alexander Belopolsky40018472011-02-26 01:02:56 +00006243static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006244unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006245 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006246 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006247{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006248 /* input state */
6249 Py_ssize_t pos=0, size;
6250 int kind;
6251 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006252 /* output object */
6253 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254 /* pointer into the output */
6255 char *str;
6256 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006257 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006258 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6259 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006260 PyObject *errorHandler = NULL;
6261 PyObject *exc = NULL;
6262 /* the following variable is used for caching string comparisons
6263 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6264 int known_errorHandler = -1;
6265
Benjamin Petersonbac79492012-01-14 13:34:47 -05006266 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006267 return NULL;
6268 size = PyUnicode_GET_LENGTH(unicode);
6269 kind = PyUnicode_KIND(unicode);
6270 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271 /* allocate enough for a simple encoding without
6272 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006273 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006274 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006275 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006276 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006277 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006278 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 ressize = size;
6280
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006281 while (pos < size) {
6282 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 /* can we encode this? */
6285 if (c<limit) {
6286 /* no overflow check, because we know that the space is enough */
6287 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006288 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 Py_ssize_t requiredsize;
6292 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006293 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006295 Py_ssize_t collstart = pos;
6296 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006298 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 ++collend;
6300 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6301 if (known_errorHandler==-1) {
6302 if ((errors==NULL) || (!strcmp(errors, "strict")))
6303 known_errorHandler = 1;
6304 else if (!strcmp(errors, "replace"))
6305 known_errorHandler = 2;
6306 else if (!strcmp(errors, "ignore"))
6307 known_errorHandler = 3;
6308 else if (!strcmp(errors, "xmlcharrefreplace"))
6309 known_errorHandler = 4;
6310 else
6311 known_errorHandler = 0;
6312 }
6313 switch (known_errorHandler) {
6314 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006315 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 goto onError;
6317 case 2: /* replace */
6318 while (collstart++<collend)
6319 *str++ = '?'; /* fall through */
6320 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006321 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 break;
6323 case 4: /* xmlcharrefreplace */
6324 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006325 /* determine replacement size */
6326 for (i = collstart, repsize = 0; i < collend; ++i) {
6327 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6328 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006330 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006332 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006334 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006336 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006338 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006340 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006341 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006345 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 if (requiredsize > ressize) {
6347 if (requiredsize<2*ressize)
6348 requiredsize = 2*ressize;
6349 if (_PyBytes_Resize(&res, requiredsize))
6350 goto onError;
6351 str = PyBytes_AS_STRING(res) + respos;
6352 ressize = requiredsize;
6353 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006354 /* generate replacement */
6355 for (i = collstart; i < collend; ++i) {
6356 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 break;
6360 default:
6361 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006362 encoding, reason, unicode, &exc,
6363 collstart, collend, &newpos);
6364 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006365 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006367 if (PyBytes_Check(repunicode)) {
6368 /* Directly copy bytes result to output. */
6369 repsize = PyBytes_Size(repunicode);
6370 if (repsize > 1) {
6371 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006372 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006373 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6374 Py_DECREF(repunicode);
6375 goto onError;
6376 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006377 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006378 ressize += repsize-1;
6379 }
6380 memcpy(str, PyBytes_AsString(repunicode), repsize);
6381 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006382 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006383 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006384 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006385 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 /* need more space? (at least enough for what we
6387 have+the replacement+the rest of the string, so
6388 we won't have to check space for encodable characters) */
6389 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006390 repsize = PyUnicode_GET_LENGTH(repunicode);
6391 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 if (requiredsize > ressize) {
6393 if (requiredsize<2*ressize)
6394 requiredsize = 2*ressize;
6395 if (_PyBytes_Resize(&res, requiredsize)) {
6396 Py_DECREF(repunicode);
6397 goto onError;
6398 }
6399 str = PyBytes_AS_STRING(res) + respos;
6400 ressize = requiredsize;
6401 }
6402 /* check if there is anything unencodable in the replacement
6403 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006404 for (i = 0; repsize-->0; ++i, ++str) {
6405 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006407 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 Py_DECREF(repunicode);
6410 goto onError;
6411 }
6412 *str = (char)c;
6413 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006415 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006416 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006417 }
6418 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006419 /* Resize if we allocated to much */
6420 size = str - PyBytes_AS_STRING(res);
6421 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006422 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006423 if (_PyBytes_Resize(&res, size) < 0)
6424 goto onError;
6425 }
6426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 Py_XDECREF(errorHandler);
6428 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006429 return res;
6430
6431 onError:
6432 Py_XDECREF(res);
6433 Py_XDECREF(errorHandler);
6434 Py_XDECREF(exc);
6435 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436}
6437
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006438/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006439PyObject *
6440PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006441 Py_ssize_t size,
6442 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444 PyObject *result;
6445 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6446 if (unicode == NULL)
6447 return NULL;
6448 result = unicode_encode_ucs1(unicode, errors, 256);
6449 Py_DECREF(unicode);
6450 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451}
6452
Alexander Belopolsky40018472011-02-26 01:02:56 +00006453PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006454_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455{
6456 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 PyErr_BadArgument();
6458 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006460 if (PyUnicode_READY(unicode) == -1)
6461 return NULL;
6462 /* Fast path: if it is a one-byte string, construct
6463 bytes object directly. */
6464 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6465 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6466 PyUnicode_GET_LENGTH(unicode));
6467 /* Non-Latin-1 characters present. Defer to above function to
6468 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006469 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006470}
6471
6472PyObject*
6473PyUnicode_AsLatin1String(PyObject *unicode)
6474{
6475 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476}
6477
6478/* --- 7-bit ASCII Codec -------------------------------------------------- */
6479
Alexander Belopolsky40018472011-02-26 01:02:56 +00006480PyObject *
6481PyUnicode_DecodeASCII(const char *s,
6482 Py_ssize_t size,
6483 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006485 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006486 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006487 int kind;
6488 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006489 Py_ssize_t startinpos;
6490 Py_ssize_t endinpos;
6491 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006492 const char *e;
6493 PyObject *errorHandler = NULL;
6494 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006495
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006497 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006498
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006500 if (size == 1 && (unsigned char)s[0] < 128)
6501 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006502
Victor Stinner8f674cc2013-04-17 23:02:17 +02006503 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006504 writer.min_length = size;
6505 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006506 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006507
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006508 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006509 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006510 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006511 writer.pos = outpos;
6512 if (writer.pos == size)
6513 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006514
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006515 s += writer.pos;
6516 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006517 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006518 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006520 PyUnicode_WRITE(kind, data, writer.pos, c);
6521 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 ++s;
6523 }
6524 else {
6525 startinpos = s-starts;
6526 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006527 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 errors, &errorHandler,
6529 "ascii", "ordinal not in range(128)",
6530 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006531 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006533 kind = writer.kind;
6534 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006537 Py_XDECREF(errorHandler);
6538 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006539 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006540
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006542 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 Py_XDECREF(errorHandler);
6544 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 return NULL;
6546}
6547
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006548/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006549PyObject *
6550PyUnicode_EncodeASCII(const Py_UNICODE *p,
6551 Py_ssize_t size,
6552 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554 PyObject *result;
6555 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6556 if (unicode == NULL)
6557 return NULL;
6558 result = unicode_encode_ucs1(unicode, errors, 128);
6559 Py_DECREF(unicode);
6560 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561}
6562
Alexander Belopolsky40018472011-02-26 01:02:56 +00006563PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006564_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565{
6566 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 PyErr_BadArgument();
6568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006570 if (PyUnicode_READY(unicode) == -1)
6571 return NULL;
6572 /* Fast path: if it is an ASCII-only string, construct bytes object
6573 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006574 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006575 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6576 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006578}
6579
6580PyObject *
6581PyUnicode_AsASCIIString(PyObject *unicode)
6582{
6583 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584}
6585
Victor Stinner99b95382011-07-04 14:23:54 +02006586#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006587
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006588/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006589
/* When int is narrower than size_t, an input can be longer than INT_MAX;
   NEED_RETRY enables the code that processes such input in INT_MAX-sized
   chunks (see decode_code_page_stateful). */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback value for builds whose headers do not define this flag. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6597
6598static char*
6599code_page_name(UINT code_page, PyObject **obj)
6600{
6601 *obj = NULL;
6602 if (code_page == CP_ACP)
6603 return "mbcs";
6604 if (code_page == CP_UTF7)
6605 return "CP_UTF7";
6606 if (code_page == CP_UTF8)
6607 return "CP_UTF8";
6608
6609 *obj = PyBytes_FromFormat("cp%u", code_page);
6610 if (*obj == NULL)
6611 return NULL;
6612 return PyBytes_AS_STRING(*obj);
6613}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006614
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006616is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617{
6618 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006619 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006620
Victor Stinner3a50e702011-10-18 21:21:00 +02006621 if (!IsDBCSLeadByteEx(code_page, *curr))
6622 return 0;
6623
6624 prev = CharPrevExA(code_page, s, curr, 0);
6625 if (prev == curr)
6626 return 1;
6627 /* FIXME: This code is limited to "true" double-byte encodings,
6628 as it assumes an incomplete character consists of a single
6629 byte. */
6630 if (curr - prev == 2)
6631 return 1;
6632 if (!IsDBCSLeadByteEx(code_page, *prev))
6633 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006634 return 0;
6635}
6636
Victor Stinner3a50e702011-10-18 21:21:00 +02006637static DWORD
6638decode_code_page_flags(UINT code_page)
6639{
6640 if (code_page == CP_UTF7) {
6641 /* The CP_UTF7 decoder only supports flags=0 */
6642 return 0;
6643 }
6644 else
6645 return MB_ERR_INVALID_CHARS;
6646}
6647
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006648/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006649 * Decode a byte string from a Windows code page into unicode object in strict
6650 * mode.
6651 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006652 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6653 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006654 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006655static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006656decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006657 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006658 const char *in,
6659 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006660{
Victor Stinner3a50e702011-10-18 21:21:00 +02006661 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006662 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006663 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006664
6665 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006666 assert(insize > 0);
6667 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6668 if (outsize <= 0)
6669 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006670
6671 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006673 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006674 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 if (*v == NULL)
6676 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006677 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006678 }
6679 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006681 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006682 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006684 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006685 }
6686
6687 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006688 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6689 if (outsize <= 0)
6690 goto error;
6691 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006692
Victor Stinner3a50e702011-10-18 21:21:00 +02006693error:
6694 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6695 return -2;
6696 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006697 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006698}
6699
Victor Stinner3a50e702011-10-18 21:21:00 +02006700/*
6701 * Decode a byte string from a code page into unicode object with an error
6702 * handler.
6703 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006704 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006705 * UnicodeDecodeError exception and returns -1 on error.
6706 */
6707static int
6708decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006709 PyObject **v,
6710 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006711 const char *errors)
6712{
6713 const char *startin = in;
6714 const char *endin = in + size;
6715 const DWORD flags = decode_code_page_flags(code_page);
6716 /* Ideally, we should get reason from FormatMessage. This is the Windows
6717 2000 English version of the message. */
6718 const char *reason = "No mapping for the Unicode character exists "
6719 "in the target code page.";
6720 /* each step cannot decode more than 1 character, but a character can be
6721 represented as a surrogate pair */
6722 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006723 int insize;
6724 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006725 PyObject *errorHandler = NULL;
6726 PyObject *exc = NULL;
6727 PyObject *encoding_obj = NULL;
6728 char *encoding;
6729 DWORD err;
6730 int ret = -1;
6731
6732 assert(size > 0);
6733
6734 encoding = code_page_name(code_page, &encoding_obj);
6735 if (encoding == NULL)
6736 return -1;
6737
6738 if (errors == NULL || strcmp(errors, "strict") == 0) {
6739 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6740 UnicodeDecodeError. */
6741 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6742 if (exc != NULL) {
6743 PyCodec_StrictErrors(exc);
6744 Py_CLEAR(exc);
6745 }
6746 goto error;
6747 }
6748
6749 if (*v == NULL) {
6750 /* Create unicode object */
6751 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6752 PyErr_NoMemory();
6753 goto error;
6754 }
Victor Stinnerab595942011-12-17 04:59:06 +01006755 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006756 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006757 if (*v == NULL)
6758 goto error;
6759 startout = PyUnicode_AS_UNICODE(*v);
6760 }
6761 else {
6762 /* Extend unicode object */
6763 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6764 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6765 PyErr_NoMemory();
6766 goto error;
6767 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006768 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006769 goto error;
6770 startout = PyUnicode_AS_UNICODE(*v) + n;
6771 }
6772
6773 /* Decode the byte string character per character */
6774 out = startout;
6775 while (in < endin)
6776 {
6777 /* Decode a character */
6778 insize = 1;
6779 do
6780 {
6781 outsize = MultiByteToWideChar(code_page, flags,
6782 in, insize,
6783 buffer, Py_ARRAY_LENGTH(buffer));
6784 if (outsize > 0)
6785 break;
6786 err = GetLastError();
6787 if (err != ERROR_NO_UNICODE_TRANSLATION
6788 && err != ERROR_INSUFFICIENT_BUFFER)
6789 {
6790 PyErr_SetFromWindowsErr(0);
6791 goto error;
6792 }
6793 insize++;
6794 }
6795 /* 4=maximum length of a UTF-8 sequence */
6796 while (insize <= 4 && (in + insize) <= endin);
6797
6798 if (outsize <= 0) {
6799 Py_ssize_t startinpos, endinpos, outpos;
6800
6801 startinpos = in - startin;
6802 endinpos = startinpos + 1;
6803 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006804 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006805 errors, &errorHandler,
6806 encoding, reason,
6807 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006808 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006809 {
6810 goto error;
6811 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006812 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006813 }
6814 else {
6815 in += insize;
6816 memcpy(out, buffer, outsize * sizeof(wchar_t));
6817 out += outsize;
6818 }
6819 }
6820
6821 /* write a NUL character at the end */
6822 *out = 0;
6823
6824 /* Extend unicode object */
6825 outsize = out - startout;
6826 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006827 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006828 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006829 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006830
6831error:
6832 Py_XDECREF(encoding_obj);
6833 Py_XDECREF(errorHandler);
6834 Py_XDECREF(exc);
6835 return ret;
6836}
6837
Victor Stinner3a50e702011-10-18 21:21:00 +02006838static PyObject *
6839decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006840 const char *s, Py_ssize_t size,
6841 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006842{
Victor Stinner76a31a62011-11-04 00:05:13 +01006843 PyObject *v = NULL;
6844 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006845
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 if (code_page < 0) {
6847 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6848 return NULL;
6849 }
6850
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006851 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
Victor Stinner76a31a62011-11-04 00:05:13 +01006854 do
6855 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006856#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006857 if (size > INT_MAX) {
6858 chunk_size = INT_MAX;
6859 final = 0;
6860 done = 0;
6861 }
6862 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006863#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006864 {
6865 chunk_size = (int)size;
6866 final = (consumed == NULL);
6867 done = 1;
6868 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869
Victor Stinner76a31a62011-11-04 00:05:13 +01006870 /* Skip trailing lead-byte unless 'final' is set */
6871 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6872 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873
Victor Stinner76a31a62011-11-04 00:05:13 +01006874 if (chunk_size == 0 && done) {
6875 if (v != NULL)
6876 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006877 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006878 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879
Victor Stinner76a31a62011-11-04 00:05:13 +01006880
6881 converted = decode_code_page_strict(code_page, &v,
6882 s, chunk_size);
6883 if (converted == -2)
6884 converted = decode_code_page_errors(code_page, &v,
6885 s, chunk_size,
6886 errors);
6887 assert(converted != 0);
6888
6889 if (converted < 0) {
6890 Py_XDECREF(v);
6891 return NULL;
6892 }
6893
6894 if (consumed)
6895 *consumed += converted;
6896
6897 s += converted;
6898 size -= converted;
6899 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006900
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006901 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902}
6903
Alexander Belopolsky40018472011-02-26 01:02:56 +00006904PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006905PyUnicode_DecodeCodePageStateful(int code_page,
6906 const char *s,
6907 Py_ssize_t size,
6908 const char *errors,
6909 Py_ssize_t *consumed)
6910{
6911 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6912}
6913
6914PyObject *
6915PyUnicode_DecodeMBCSStateful(const char *s,
6916 Py_ssize_t size,
6917 const char *errors,
6918 Py_ssize_t *consumed)
6919{
6920 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6921}
6922
6923PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006924PyUnicode_DecodeMBCS(const char *s,
6925 Py_ssize_t size,
6926 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006927{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006928 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6929}
6930
Victor Stinner3a50e702011-10-18 21:21:00 +02006931static DWORD
6932encode_code_page_flags(UINT code_page, const char *errors)
6933{
6934 if (code_page == CP_UTF8) {
6935 if (winver.dwMajorVersion >= 6)
6936 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6937 and later */
6938 return WC_ERR_INVALID_CHARS;
6939 else
6940 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6941 return 0;
6942 }
6943 else if (code_page == CP_UTF7) {
6944 /* CP_UTF7 only supports flags=0 */
6945 return 0;
6946 }
6947 else {
6948 if (errors != NULL && strcmp(errors, "replace") == 0)
6949 return 0;
6950 else
6951 return WC_NO_BEST_FIT_CHARS;
6952 }
6953}
6954
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006956 * Encode a Unicode string to a Windows code page into a byte string in strict
6957 * mode.
6958 *
6959 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006960 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006962static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006963encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006964 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006966{
Victor Stinner554f3f02010-06-16 23:33:54 +00006967 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006968 BOOL *pusedDefaultChar = &usedDefaultChar;
6969 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006970 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006971 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006972 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 const DWORD flags = encode_code_page_flags(code_page, NULL);
6974 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006975 /* Create a substring so that we can get the UTF-16 representation
6976 of just the slice under consideration. */
6977 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006978
Martin v. Löwis3d325192011-11-04 18:23:06 +01006979 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006980
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006982 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006983 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006984 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006985
Victor Stinner2fc507f2011-11-04 20:06:39 +01006986 substring = PyUnicode_Substring(unicode, offset, offset+len);
6987 if (substring == NULL)
6988 return -1;
6989 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6990 if (p == NULL) {
6991 Py_DECREF(substring);
6992 return -1;
6993 }
Victor Stinner9f067f42013-06-05 00:21:31 +02006994 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01006995
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006996 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006997 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02006998 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006999 NULL, 0,
7000 NULL, pusedDefaultChar);
7001 if (outsize <= 0)
7002 goto error;
7003 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007004 if (pusedDefaultChar && *pusedDefaultChar) {
7005 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007006 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007007 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007008
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007012 if (*outbytes == NULL) {
7013 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007015 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007016 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007017 }
7018 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007020 const Py_ssize_t n = PyBytes_Size(*outbytes);
7021 if (outsize > PY_SSIZE_T_MAX - n) {
7022 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007023 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007026 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7027 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007029 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007030 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007031 }
7032
7033 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007034 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007035 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007036 out, outsize,
7037 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007038 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007039 if (outsize <= 0)
7040 goto error;
7041 if (pusedDefaultChar && *pusedDefaultChar)
7042 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007043 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007044
Victor Stinner3a50e702011-10-18 21:21:00 +02007045error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007046 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007047 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7048 return -2;
7049 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007050 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007051}
7052
Victor Stinner3a50e702011-10-18 21:21:00 +02007053/*
7054 * Encode a Unicode string to a Windows code page into a byte string using a
7055 * error handler.
7056 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007057 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007058 * -1 on other error.
7059 */
7060static int
7061encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007062 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007063 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007064{
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007066 Py_ssize_t pos = unicode_offset;
7067 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007068 /* Ideally, we should get reason from FormatMessage. This is the Windows
7069 2000 English version of the message. */
7070 const char *reason = "invalid character";
7071 /* 4=maximum length of a UTF-8 sequence */
7072 char buffer[4];
7073 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7074 Py_ssize_t outsize;
7075 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007076 PyObject *errorHandler = NULL;
7077 PyObject *exc = NULL;
7078 PyObject *encoding_obj = NULL;
7079 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007080 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 PyObject *rep;
7082 int ret = -1;
7083
7084 assert(insize > 0);
7085
7086 encoding = code_page_name(code_page, &encoding_obj);
7087 if (encoding == NULL)
7088 return -1;
7089
7090 if (errors == NULL || strcmp(errors, "strict") == 0) {
7091 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7092 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007093 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 if (exc != NULL) {
7095 PyCodec_StrictErrors(exc);
7096 Py_DECREF(exc);
7097 }
7098 Py_XDECREF(encoding_obj);
7099 return -1;
7100 }
7101
7102 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7103 pusedDefaultChar = &usedDefaultChar;
7104 else
7105 pusedDefaultChar = NULL;
7106
7107 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7108 PyErr_NoMemory();
7109 goto error;
7110 }
7111 outsize = insize * Py_ARRAY_LENGTH(buffer);
7112
7113 if (*outbytes == NULL) {
7114 /* Create string object */
7115 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7116 if (*outbytes == NULL)
7117 goto error;
7118 out = PyBytes_AS_STRING(*outbytes);
7119 }
7120 else {
7121 /* Extend string object */
7122 Py_ssize_t n = PyBytes_Size(*outbytes);
7123 if (n > PY_SSIZE_T_MAX - outsize) {
7124 PyErr_NoMemory();
7125 goto error;
7126 }
7127 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7128 goto error;
7129 out = PyBytes_AS_STRING(*outbytes) + n;
7130 }
7131
7132 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007133 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007134 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007135 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7136 wchar_t chars[2];
7137 int charsize;
7138 if (ch < 0x10000) {
7139 chars[0] = (wchar_t)ch;
7140 charsize = 1;
7141 }
7142 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007143 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7144 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007145 charsize = 2;
7146 }
7147
Victor Stinner3a50e702011-10-18 21:21:00 +02007148 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007149 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 buffer, Py_ARRAY_LENGTH(buffer),
7151 NULL, pusedDefaultChar);
7152 if (outsize > 0) {
7153 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7154 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007155 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 memcpy(out, buffer, outsize);
7157 out += outsize;
7158 continue;
7159 }
7160 }
7161 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7162 PyErr_SetFromWindowsErr(0);
7163 goto error;
7164 }
7165
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 rep = unicode_encode_call_errorhandler(
7167 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007168 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007169 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 if (rep == NULL)
7171 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007172 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007173
7174 if (PyBytes_Check(rep)) {
7175 outsize = PyBytes_GET_SIZE(rep);
7176 if (outsize != 1) {
7177 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7178 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7179 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7180 Py_DECREF(rep);
7181 goto error;
7182 }
7183 out = PyBytes_AS_STRING(*outbytes) + offset;
7184 }
7185 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7186 out += outsize;
7187 }
7188 else {
7189 Py_ssize_t i;
7190 enum PyUnicode_Kind kind;
7191 void *data;
7192
Benjamin Petersonbac79492012-01-14 13:34:47 -05007193 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 Py_DECREF(rep);
7195 goto error;
7196 }
7197
7198 outsize = PyUnicode_GET_LENGTH(rep);
7199 if (outsize != 1) {
7200 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7201 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7202 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7203 Py_DECREF(rep);
7204 goto error;
7205 }
7206 out = PyBytes_AS_STRING(*outbytes) + offset;
7207 }
7208 kind = PyUnicode_KIND(rep);
7209 data = PyUnicode_DATA(rep);
7210 for (i=0; i < outsize; i++) {
7211 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7212 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007213 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007214 encoding, unicode,
7215 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 "unable to encode error handler result to ASCII");
7217 Py_DECREF(rep);
7218 goto error;
7219 }
7220 *out = (unsigned char)ch;
7221 out++;
7222 }
7223 }
7224 Py_DECREF(rep);
7225 }
7226 /* write a NUL byte */
7227 *out = 0;
7228 outsize = out - PyBytes_AS_STRING(*outbytes);
7229 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7230 if (_PyBytes_Resize(outbytes, outsize) < 0)
7231 goto error;
7232 ret = 0;
7233
7234error:
7235 Py_XDECREF(encoding_obj);
7236 Py_XDECREF(errorHandler);
7237 Py_XDECREF(exc);
7238 return ret;
7239}
7240
Victor Stinner3a50e702011-10-18 21:21:00 +02007241static PyObject *
7242encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007243 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 const char *errors)
7245{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007246 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007248 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007249 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007250
Benjamin Petersonbac79492012-01-14 13:34:47 -05007251 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007252 return NULL;
7253 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007254
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 if (code_page < 0) {
7256 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7257 return NULL;
7258 }
7259
Martin v. Löwis3d325192011-11-04 18:23:06 +01007260 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007261 return PyBytes_FromStringAndSize(NULL, 0);
7262
Victor Stinner7581cef2011-11-03 22:32:33 +01007263 offset = 0;
7264 do
7265 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007266#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007267 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007268 chunks. */
7269 if (len > INT_MAX/2) {
7270 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007271 done = 0;
7272 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007273 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007274#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007275 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007276 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007277 done = 1;
7278 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007279
Victor Stinner76a31a62011-11-04 00:05:13 +01007280 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007281 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007282 errors);
7283 if (ret == -2)
7284 ret = encode_code_page_errors(code_page, &outbytes,
7285 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007286 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007287 if (ret < 0) {
7288 Py_XDECREF(outbytes);
7289 return NULL;
7290 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007291
Victor Stinner7581cef2011-11-03 22:32:33 +01007292 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007293 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007294 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007295
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 return outbytes;
7297}
7298
7299PyObject *
7300PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7301 Py_ssize_t size,
7302 const char *errors)
7303{
Victor Stinner7581cef2011-11-03 22:32:33 +01007304 PyObject *unicode, *res;
7305 unicode = PyUnicode_FromUnicode(p, size);
7306 if (unicode == NULL)
7307 return NULL;
7308 res = encode_code_page(CP_ACP, unicode, errors);
7309 Py_DECREF(unicode);
7310 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007311}
7312
7313PyObject *
7314PyUnicode_EncodeCodePage(int code_page,
7315 PyObject *unicode,
7316 const char *errors)
7317{
Victor Stinner7581cef2011-11-03 22:32:33 +01007318 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007319}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007320
Alexander Belopolsky40018472011-02-26 01:02:56 +00007321PyObject *
7322PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007323{
7324 if (!PyUnicode_Check(unicode)) {
7325 PyErr_BadArgument();
7326 return NULL;
7327 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007328 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007329}
7330
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007331#undef NEED_RETRY
7332
Victor Stinner99b95382011-07-04 14:23:54 +02007333#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007334
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335/* --- Character Mapping Codec -------------------------------------------- */
7336
Victor Stinnerfb161b12013-04-18 01:44:27 +02007337static int
7338charmap_decode_string(const char *s,
7339 Py_ssize_t size,
7340 PyObject *mapping,
7341 const char *errors,
7342 _PyUnicodeWriter *writer)
7343{
7344 const char *starts = s;
7345 const char *e;
7346 Py_ssize_t startinpos, endinpos;
7347 PyObject *errorHandler = NULL, *exc = NULL;
7348 Py_ssize_t maplen;
7349 enum PyUnicode_Kind mapkind;
7350 void *mapdata;
7351 Py_UCS4 x;
7352 unsigned char ch;
7353
7354 if (PyUnicode_READY(mapping) == -1)
7355 return -1;
7356
7357 maplen = PyUnicode_GET_LENGTH(mapping);
7358 mapdata = PyUnicode_DATA(mapping);
7359 mapkind = PyUnicode_KIND(mapping);
7360
7361 e = s + size;
7362
7363 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7364 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7365 * is disabled in encoding aliases, latin1 is preferred because
7366 * its implementation is faster. */
7367 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7368 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7369 Py_UCS4 maxchar = writer->maxchar;
7370
7371 assert (writer->kind == PyUnicode_1BYTE_KIND);
7372 while (s < e) {
7373 ch = *s;
7374 x = mapdata_ucs1[ch];
7375 if (x > maxchar) {
7376 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7377 goto onError;
7378 maxchar = writer->maxchar;
7379 outdata = (Py_UCS1 *)writer->data;
7380 }
7381 outdata[writer->pos] = x;
7382 writer->pos++;
7383 ++s;
7384 }
7385 return 0;
7386 }
7387
7388 while (s < e) {
7389 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7390 enum PyUnicode_Kind outkind = writer->kind;
7391 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7392 if (outkind == PyUnicode_1BYTE_KIND) {
7393 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7394 Py_UCS4 maxchar = writer->maxchar;
7395 while (s < e) {
7396 ch = *s;
7397 x = mapdata_ucs2[ch];
7398 if (x > maxchar)
7399 goto Error;
7400 outdata[writer->pos] = x;
7401 writer->pos++;
7402 ++s;
7403 }
7404 break;
7405 }
7406 else if (outkind == PyUnicode_2BYTE_KIND) {
7407 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7408 while (s < e) {
7409 ch = *s;
7410 x = mapdata_ucs2[ch];
7411 if (x == 0xFFFE)
7412 goto Error;
7413 outdata[writer->pos] = x;
7414 writer->pos++;
7415 ++s;
7416 }
7417 break;
7418 }
7419 }
7420 ch = *s;
7421
7422 if (ch < maplen)
7423 x = PyUnicode_READ(mapkind, mapdata, ch);
7424 else
7425 x = 0xfffe; /* invalid value */
7426Error:
7427 if (x == 0xfffe)
7428 {
7429 /* undefined mapping */
7430 startinpos = s-starts;
7431 endinpos = startinpos+1;
7432 if (unicode_decode_call_errorhandler_writer(
7433 errors, &errorHandler,
7434 "charmap", "character maps to <undefined>",
7435 &starts, &e, &startinpos, &endinpos, &exc, &s,
7436 writer)) {
7437 goto onError;
7438 }
7439 continue;
7440 }
7441
7442 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7443 goto onError;
7444 ++s;
7445 }
7446 Py_XDECREF(errorHandler);
7447 Py_XDECREF(exc);
7448 return 0;
7449
7450onError:
7451 Py_XDECREF(errorHandler);
7452 Py_XDECREF(exc);
7453 return -1;
7454}
7455
7456static int
7457charmap_decode_mapping(const char *s,
7458 Py_ssize_t size,
7459 PyObject *mapping,
7460 const char *errors,
7461 _PyUnicodeWriter *writer)
7462{
7463 const char *starts = s;
7464 const char *e;
7465 Py_ssize_t startinpos, endinpos;
7466 PyObject *errorHandler = NULL, *exc = NULL;
7467 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007468 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007469
7470 e = s + size;
7471
7472 while (s < e) {
7473 ch = *s;
7474
7475 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7476 key = PyLong_FromLong((long)ch);
7477 if (key == NULL)
7478 goto onError;
7479
7480 item = PyObject_GetItem(mapping, key);
7481 Py_DECREF(key);
7482 if (item == NULL) {
7483 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7484 /* No mapping found means: mapping is undefined. */
7485 PyErr_Clear();
7486 goto Undefined;
7487 } else
7488 goto onError;
7489 }
7490
7491 /* Apply mapping */
7492 if (item == Py_None)
7493 goto Undefined;
7494 if (PyLong_Check(item)) {
7495 long value = PyLong_AS_LONG(item);
7496 if (value == 0xFFFE)
7497 goto Undefined;
7498 if (value < 0 || value > MAX_UNICODE) {
7499 PyErr_Format(PyExc_TypeError,
7500 "character mapping must be in range(0x%lx)",
7501 (unsigned long)MAX_UNICODE + 1);
7502 goto onError;
7503 }
7504
7505 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7506 goto onError;
7507 }
7508 else if (PyUnicode_Check(item)) {
7509 if (PyUnicode_READY(item) == -1)
7510 goto onError;
7511 if (PyUnicode_GET_LENGTH(item) == 1) {
7512 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7513 if (value == 0xFFFE)
7514 goto Undefined;
7515 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7516 goto onError;
7517 }
7518 else {
7519 writer->overallocate = 1;
7520 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7521 goto onError;
7522 }
7523 }
7524 else {
7525 /* wrong return value */
7526 PyErr_SetString(PyExc_TypeError,
7527 "character mapping must return integer, None or str");
7528 goto onError;
7529 }
7530 Py_CLEAR(item);
7531 ++s;
7532 continue;
7533
7534Undefined:
7535 /* undefined mapping */
7536 Py_CLEAR(item);
7537 startinpos = s-starts;
7538 endinpos = startinpos+1;
7539 if (unicode_decode_call_errorhandler_writer(
7540 errors, &errorHandler,
7541 "charmap", "character maps to <undefined>",
7542 &starts, &e, &startinpos, &endinpos, &exc, &s,
7543 writer)) {
7544 goto onError;
7545 }
7546 }
7547 Py_XDECREF(errorHandler);
7548 Py_XDECREF(exc);
7549 return 0;
7550
7551onError:
7552 Py_XDECREF(item);
7553 Py_XDECREF(errorHandler);
7554 Py_XDECREF(exc);
7555 return -1;
7556}
7557
Alexander Belopolsky40018472011-02-26 01:02:56 +00007558PyObject *
7559PyUnicode_DecodeCharmap(const char *s,
7560 Py_ssize_t size,
7561 PyObject *mapping,
7562 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007564 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007565
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 /* Default to Latin-1 */
7567 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007571 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007572 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007573 writer.min_length = size;
7574 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007576
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007577 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007578 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7579 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007580 }
7581 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007582 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007585 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007586
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007588 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 return NULL;
7590}
7591
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007592/* Charmap encoding: the lookup table */
7593
Alexander Belopolsky40018472011-02-26 01:02:56 +00007594struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 PyObject_HEAD
7596 unsigned char level1[32];
7597 int count2, count3;
7598 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007599};
7600
7601static PyObject*
7602encoding_map_size(PyObject *obj, PyObject* args)
7603{
7604 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007605 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007607}
7608
7609static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007610 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 PyDoc_STR("Return the size (in bytes) of this object") },
7612 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007613};
7614
7615static void
7616encoding_map_dealloc(PyObject* o)
7617{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007618 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619}
7620
7621static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007622 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 "EncodingMap", /*tp_name*/
7624 sizeof(struct encoding_map), /*tp_basicsize*/
7625 0, /*tp_itemsize*/
7626 /* methods */
7627 encoding_map_dealloc, /*tp_dealloc*/
7628 0, /*tp_print*/
7629 0, /*tp_getattr*/
7630 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007631 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 0, /*tp_repr*/
7633 0, /*tp_as_number*/
7634 0, /*tp_as_sequence*/
7635 0, /*tp_as_mapping*/
7636 0, /*tp_hash*/
7637 0, /*tp_call*/
7638 0, /*tp_str*/
7639 0, /*tp_getattro*/
7640 0, /*tp_setattro*/
7641 0, /*tp_as_buffer*/
7642 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7643 0, /*tp_doc*/
7644 0, /*tp_traverse*/
7645 0, /*tp_clear*/
7646 0, /*tp_richcompare*/
7647 0, /*tp_weaklistoffset*/
7648 0, /*tp_iter*/
7649 0, /*tp_iternext*/
7650 encoding_map_methods, /*tp_methods*/
7651 0, /*tp_members*/
7652 0, /*tp_getset*/
7653 0, /*tp_base*/
7654 0, /*tp_dict*/
7655 0, /*tp_descr_get*/
7656 0, /*tp_descr_set*/
7657 0, /*tp_dictoffset*/
7658 0, /*tp_init*/
7659 0, /*tp_alloc*/
7660 0, /*tp_new*/
7661 0, /*tp_free*/
7662 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007663};
7664
7665PyObject*
7666PyUnicode_BuildEncodingMap(PyObject* string)
7667{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007668 PyObject *result;
7669 struct encoding_map *mresult;
7670 int i;
7671 int need_dict = 0;
7672 unsigned char level1[32];
7673 unsigned char level2[512];
7674 unsigned char *mlevel1, *mlevel2, *mlevel3;
7675 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007676 int kind;
7677 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007678 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007679 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007680
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007681 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007682 PyErr_BadArgument();
7683 return NULL;
7684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007685 kind = PyUnicode_KIND(string);
7686 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007687 length = PyUnicode_GET_LENGTH(string);
7688 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007689 memset(level1, 0xFF, sizeof level1);
7690 memset(level2, 0xFF, sizeof level2);
7691
7692 /* If there isn't a one-to-one mapping of NULL to \0,
7693 or if there are non-BMP characters, we need to use
7694 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007695 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007696 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007697 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007698 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007699 ch = PyUnicode_READ(kind, data, i);
7700 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007701 need_dict = 1;
7702 break;
7703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007704 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007705 /* unmapped character */
7706 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007707 l1 = ch >> 11;
7708 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007709 if (level1[l1] == 0xFF)
7710 level1[l1] = count2++;
7711 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007712 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713 }
7714
7715 if (count2 >= 0xFF || count3 >= 0xFF)
7716 need_dict = 1;
7717
7718 if (need_dict) {
7719 PyObject *result = PyDict_New();
7720 PyObject *key, *value;
7721 if (!result)
7722 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007723 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007724 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007725 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007726 if (!key || !value)
7727 goto failed1;
7728 if (PyDict_SetItem(result, key, value) == -1)
7729 goto failed1;
7730 Py_DECREF(key);
7731 Py_DECREF(value);
7732 }
7733 return result;
7734 failed1:
7735 Py_XDECREF(key);
7736 Py_XDECREF(value);
7737 Py_DECREF(result);
7738 return NULL;
7739 }
7740
7741 /* Create a three-level trie */
7742 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7743 16*count2 + 128*count3 - 1);
7744 if (!result)
7745 return PyErr_NoMemory();
7746 PyObject_Init(result, &EncodingMapType);
7747 mresult = (struct encoding_map*)result;
7748 mresult->count2 = count2;
7749 mresult->count3 = count3;
7750 mlevel1 = mresult->level1;
7751 mlevel2 = mresult->level23;
7752 mlevel3 = mresult->level23 + 16*count2;
7753 memcpy(mlevel1, level1, 32);
7754 memset(mlevel2, 0xFF, 16*count2);
7755 memset(mlevel3, 0, 128*count3);
7756 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007757 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007759 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7760 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007761 /* unmapped character */
7762 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007763 o1 = ch>>11;
7764 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007765 i2 = 16*mlevel1[o1] + o2;
7766 if (mlevel2[i2] == 0xFF)
7767 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007768 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769 i3 = 128*mlevel2[i2] + o3;
7770 mlevel3[i3] = i;
7771 }
7772 return result;
7773}
7774
7775static int
Victor Stinner22168992011-11-20 17:09:18 +01007776encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777{
7778 struct encoding_map *map = (struct encoding_map*)mapping;
7779 int l1 = c>>11;
7780 int l2 = (c>>7) & 0xF;
7781 int l3 = c & 0x7F;
7782 int i;
7783
Victor Stinner22168992011-11-20 17:09:18 +01007784 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007786 if (c == 0)
7787 return 0;
7788 /* level 1*/
7789 i = map->level1[l1];
7790 if (i == 0xFF) {
7791 return -1;
7792 }
7793 /* level 2*/
7794 i = map->level23[16*i+l2];
7795 if (i == 0xFF) {
7796 return -1;
7797 }
7798 /* level 3 */
7799 i = map->level23[16*map->count2 + 128*i + l3];
7800 if (i == 0) {
7801 return -1;
7802 }
7803 return i;
7804}
7805
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007806/* Lookup the character ch in the mapping. If the character
7807 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007808 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007809static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007810charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811{
Christian Heimes217cfd12007-12-02 14:31:20 +00007812 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007813 PyObject *x;
7814
7815 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007817 x = PyObject_GetItem(mapping, w);
7818 Py_DECREF(w);
7819 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7821 /* No mapping found means: mapping is undefined. */
7822 PyErr_Clear();
7823 x = Py_None;
7824 Py_INCREF(x);
7825 return x;
7826 } else
7827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007829 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007831 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 long value = PyLong_AS_LONG(x);
7833 if (value < 0 || value > 255) {
7834 PyErr_SetString(PyExc_TypeError,
7835 "character mapping must be in range(256)");
7836 Py_DECREF(x);
7837 return NULL;
7838 }
7839 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007841 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 /* wrong return value */
7845 PyErr_Format(PyExc_TypeError,
7846 "character mapping must return integer, bytes or None, not %.400s",
7847 x->ob_type->tp_name);
7848 Py_DECREF(x);
7849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 }
7851}
7852
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007854charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007855{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007856 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7857 /* exponentially overallocate to minimize reallocations */
7858 if (requiredsize < 2*outsize)
7859 requiredsize = 2*outsize;
7860 if (_PyBytes_Resize(outobj, requiredsize))
7861 return -1;
7862 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863}
7864
/* Outcome of charmapencode_output() for a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007869 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870 space is available. Return a new reference to the object that
7871 was put in the output buffer, or Py_None, if the mapping was undefined
7872 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007873 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007874static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007875charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007876 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007877{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878 PyObject *rep;
7879 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007880 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881
Christian Heimes90aa7642007-12-19 02:45:37 +00007882 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885 if (res == -1)
7886 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 if (outsize<requiredsize)
7888 if (charmapencode_resize(outobj, outpos, requiredsize))
7889 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007890 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 outstart[(*outpos)++] = (char)res;
7892 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 }
7894
7895 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007896 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007898 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 Py_DECREF(rep);
7900 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 if (PyLong_Check(rep)) {
7903 Py_ssize_t requiredsize = *outpos+1;
7904 if (outsize<requiredsize)
7905 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7906 Py_DECREF(rep);
7907 return enc_EXCEPTION;
7908 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007909 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007911 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 else {
7913 const char *repchars = PyBytes_AS_STRING(rep);
7914 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7915 Py_ssize_t requiredsize = *outpos+repsize;
7916 if (outsize<requiredsize)
7917 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7918 Py_DECREF(rep);
7919 return enc_EXCEPTION;
7920 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007921 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 memcpy(outstart + *outpos, repchars, repsize);
7923 *outpos += repsize;
7924 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007925 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 Py_DECREF(rep);
7927 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007928}
7929
7930/* handle an error in PyUnicode_EncodeCharmap
7931 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007932static int
7933charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007934 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007935 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007936 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007937 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007938{
7939 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007940 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007941 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007942 enum PyUnicode_Kind kind;
7943 void *data;
7944 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007945 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007946 Py_ssize_t collstartpos = *inpos;
7947 Py_ssize_t collendpos = *inpos+1;
7948 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 char *encoding = "charmap";
7950 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007951 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007952 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007953 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954
Benjamin Petersonbac79492012-01-14 13:34:47 -05007955 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007956 return -1;
7957 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007958 /* find all unencodable characters */
7959 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007960 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007961 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007962 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007963 val = encoding_map_lookup(ch, mapping);
7964 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 break;
7966 ++collendpos;
7967 continue;
7968 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007969
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007970 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7971 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 if (rep==NULL)
7973 return -1;
7974 else if (rep!=Py_None) {
7975 Py_DECREF(rep);
7976 break;
7977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007978 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980 }
7981 /* cache callback name lookup
7982 * (if not done yet, i.e. it's the first error) */
7983 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 if ((errors==NULL) || (!strcmp(errors, "strict")))
7985 *known_errorHandler = 1;
7986 else if (!strcmp(errors, "replace"))
7987 *known_errorHandler = 2;
7988 else if (!strcmp(errors, "ignore"))
7989 *known_errorHandler = 3;
7990 else if (!strcmp(errors, "xmlcharrefreplace"))
7991 *known_errorHandler = 4;
7992 else
7993 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007994 }
7995 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007996 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007997 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007998 return -1;
7999 case 2: /* replace */
8000 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 x = charmapencode_output('?', mapping, res, respos);
8002 if (x==enc_EXCEPTION) {
8003 return -1;
8004 }
8005 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008006 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 return -1;
8008 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008009 }
8010 /* fall through */
8011 case 3: /* ignore */
8012 *inpos = collendpos;
8013 break;
8014 case 4: /* xmlcharrefreplace */
8015 /* generate replacement (temporarily (mis)uses p) */
8016 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 char buffer[2+29+1+1];
8018 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008019 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 for (cp = buffer; *cp; ++cp) {
8021 x = charmapencode_output(*cp, mapping, res, respos);
8022 if (x==enc_EXCEPTION)
8023 return -1;
8024 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008025 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 return -1;
8027 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008028 }
8029 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008030 *inpos = collendpos;
8031 break;
8032 default:
8033 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008034 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008038 if (PyBytes_Check(repunicode)) {
8039 /* Directly copy bytes result to output. */
8040 Py_ssize_t outsize = PyBytes_Size(*res);
8041 Py_ssize_t requiredsize;
8042 repsize = PyBytes_Size(repunicode);
8043 requiredsize = *respos + repsize;
8044 if (requiredsize > outsize)
8045 /* Make room for all additional bytes. */
8046 if (charmapencode_resize(res, respos, requiredsize)) {
8047 Py_DECREF(repunicode);
8048 return -1;
8049 }
8050 memcpy(PyBytes_AsString(*res) + *respos,
8051 PyBytes_AsString(repunicode), repsize);
8052 *respos += repsize;
8053 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008054 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008055 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008056 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008058 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008059 Py_DECREF(repunicode);
8060 return -1;
8061 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008062 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008063 data = PyUnicode_DATA(repunicode);
8064 kind = PyUnicode_KIND(repunicode);
8065 for (index = 0; index < repsize; index++) {
8066 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8067 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008069 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 return -1;
8071 }
8072 else if (x==enc_FAILED) {
8073 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008074 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 return -1;
8076 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008077 }
8078 *inpos = newpos;
8079 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 }
8081 return 0;
8082}
8083
Alexander Belopolsky40018472011-02-26 01:02:56 +00008084PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008085_PyUnicode_EncodeCharmap(PyObject *unicode,
8086 PyObject *mapping,
8087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 /* output object */
8090 PyObject *res = NULL;
8091 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008092 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008095 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 PyObject *errorHandler = NULL;
8097 PyObject *exc = NULL;
8098 /* the following variable is used for caching string comparisons
8099 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8100 * 3=ignore, 4=xmlcharrefreplace */
8101 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008102 void *data;
8103 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104
Benjamin Petersonbac79492012-01-14 13:34:47 -05008105 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008106 return NULL;
8107 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008108 data = PyUnicode_DATA(unicode);
8109 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008110
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 /* Default to Latin-1 */
8112 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008113 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008115 /* allocate enough for a simple encoding without
8116 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008117 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 if (res == NULL)
8119 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008120 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008123 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008124 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008126 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 if (x==enc_EXCEPTION) /* error */
8128 goto onError;
8129 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 &exc,
8132 &known_errorHandler, &errorHandler, errors,
8133 &res, &respos)) {
8134 goto onError;
8135 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008136 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 else
8138 /* done with this character => adjust input position */
8139 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008143 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008144 if (_PyBytes_Resize(&res, respos) < 0)
8145 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008146
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 Py_XDECREF(exc);
8148 Py_XDECREF(errorHandler);
8149 return res;
8150
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152 Py_XDECREF(res);
8153 Py_XDECREF(exc);
8154 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 return NULL;
8156}
8157
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008158/* Deprecated */
8159PyObject *
8160PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8161 Py_ssize_t size,
8162 PyObject *mapping,
8163 const char *errors)
8164{
8165 PyObject *result;
8166 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8167 if (unicode == NULL)
8168 return NULL;
8169 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8170 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008171 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008172}
8173
Alexander Belopolsky40018472011-02-26 01:02:56 +00008174PyObject *
8175PyUnicode_AsCharmapString(PyObject *unicode,
8176 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177{
8178 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 PyErr_BadArgument();
8180 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008182 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183}
8184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008185/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008186static void
8187make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008189 Py_ssize_t startpos, Py_ssize_t endpos,
8190 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008192 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008193 *exceptionObject = _PyUnicodeTranslateError_Create(
8194 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 }
8196 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8198 goto onError;
8199 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8200 goto onError;
8201 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8202 goto onError;
8203 return;
8204 onError:
8205 Py_DECREF(*exceptionObject);
8206 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207 }
8208}
8209
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210/* error handling callback helper:
8211 build arguments, call the callback and check the arguments,
8212 put the result into newpos and return the replacement string, which
8213 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008214static PyObject *
8215unicode_translate_call_errorhandler(const char *errors,
8216 PyObject **errorHandler,
8217 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008219 Py_ssize_t startpos, Py_ssize_t endpos,
8220 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008222 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008224 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225 PyObject *restuple;
8226 PyObject *resunicode;
8227
8228 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008230 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232 }
8233
8234 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238
8239 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008244 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 Py_DECREF(restuple);
8246 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 }
8248 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 &resunicode, &i_newpos)) {
8250 Py_DECREF(restuple);
8251 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008253 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008255 else
8256 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008257 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8259 Py_DECREF(restuple);
8260 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008261 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 Py_INCREF(resunicode);
8263 Py_DECREF(restuple);
8264 return resunicode;
8265}
8266
8267/* Lookup the character ch in the mapping and put the result in result,
8268 which must be decrefed by the caller.
8269 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008270static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272{
Christian Heimes217cfd12007-12-02 14:31:20 +00008273 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 PyObject *x;
8275
8276 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 x = PyObject_GetItem(mapping, w);
8279 Py_DECREF(w);
8280 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8282 /* No mapping found means: use 1:1 mapping. */
8283 PyErr_Clear();
8284 *result = NULL;
8285 return 0;
8286 } else
8287 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008288 }
8289 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 *result = x;
8291 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008293 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 long value = PyLong_AS_LONG(x);
8295 long max = PyUnicode_GetMax();
8296 if (value < 0 || value > max) {
8297 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008298 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 Py_DECREF(x);
8300 return -1;
8301 }
8302 *result = x;
8303 return 0;
8304 }
8305 else if (PyUnicode_Check(x)) {
8306 *result = x;
8307 return 0;
8308 }
8309 else {
8310 /* wrong return value */
8311 PyErr_SetString(PyExc_TypeError,
8312 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008313 Py_DECREF(x);
8314 return -1;
8315 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316}
8317/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 if not reallocate and adjust various state variables.
8319 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008320static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008321charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008325 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008326 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 /* exponentially overallocate to minimize reallocations */
8328 if (requiredsize < 2 * oldsize)
8329 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008330 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8331 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008333 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335 }
8336 return 0;
8337}
8338/* lookup the character, put the result in the output string and adjust
8339 various state variables. Return a new reference to the object that
8340 was put in the output buffer in *result, or Py_None, if the mapping was
8341 undefined (in which case no character was written).
8342 The called must decref result.
8343 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008344static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008345charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8346 PyObject *mapping, Py_UCS4 **output,
8347 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008348 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8351 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356 }
8357 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008359 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 }
8363 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 Py_ssize_t repsize;
8365 if (PyUnicode_READY(*res) == -1)
8366 return -1;
8367 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 if (repsize==1) {
8369 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 }
8372 else if (repsize!=0) {
8373 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 Py_ssize_t requiredsize = *opos +
8375 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 Py_ssize_t i;
8378 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 for(i = 0; i < repsize; i++)
8381 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383 }
8384 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 return 0;
8387}
8388
Alexander Belopolsky40018472011-02-26 01:02:56 +00008389PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390_PyUnicode_TranslateCharmap(PyObject *input,
8391 PyObject *mapping,
8392 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 /* input object */
8395 char *idata;
8396 Py_ssize_t size, i;
8397 int kind;
8398 /* output buffer */
8399 Py_UCS4 *output = NULL;
8400 Py_ssize_t osize;
8401 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 char *reason = "character maps to <undefined>";
8405 PyObject *errorHandler = NULL;
8406 PyObject *exc = NULL;
8407 /* the following variable is used for caching string comparisons
8408 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8409 * 3=ignore, 4=xmlcharrefreplace */
8410 int known_errorHandler = -1;
8411
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 PyErr_BadArgument();
8414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 if (PyUnicode_READY(input) == -1)
8418 return NULL;
8419 idata = (char*)PyUnicode_DATA(input);
8420 kind = PyUnicode_KIND(input);
8421 size = PyUnicode_GET_LENGTH(input);
8422 i = 0;
8423
8424 if (size == 0) {
8425 Py_INCREF(input);
8426 return input;
8427 }
8428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 /* allocate enough for a simple 1:1 translation without
8430 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431 osize = size;
8432 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8433 opos = 0;
8434 if (output == NULL) {
8435 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 /* try to encode it */
8441 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008442 if (charmaptranslate_output(input, i, mapping,
8443 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 Py_XDECREF(x);
8445 goto onError;
8446 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 else { /* untranslatable character */
8451 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8452 Py_ssize_t repsize;
8453 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008456 Py_ssize_t collstart = i;
8457 Py_ssize_t collend = i+1;
8458 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 while (collend < size) {
8462 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 goto onError;
8464 Py_XDECREF(x);
8465 if (x!=Py_None)
8466 break;
8467 ++collend;
8468 }
8469 /* cache callback name lookup
8470 * (if not done yet, i.e. it's the first error) */
8471 if (known_errorHandler==-1) {
8472 if ((errors==NULL) || (!strcmp(errors, "strict")))
8473 known_errorHandler = 1;
8474 else if (!strcmp(errors, "replace"))
8475 known_errorHandler = 2;
8476 else if (!strcmp(errors, "ignore"))
8477 known_errorHandler = 3;
8478 else if (!strcmp(errors, "xmlcharrefreplace"))
8479 known_errorHandler = 4;
8480 else
8481 known_errorHandler = 0;
8482 }
8483 switch (known_errorHandler) {
8484 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008485 make_translate_exception(&exc,
8486 input, collstart, collend, reason);
8487 if (exc != NULL)
8488 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008489 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 case 2: /* replace */
8491 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 for (coll = collstart; coll<collend; coll++)
8493 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 /* fall through */
8495 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 break;
8498 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499 /* generate replacement (temporarily (mis)uses i) */
8500 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 char buffer[2+29+1+1];
8502 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8504 if (charmaptranslate_makespace(&output, &osize,
8505 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 goto onError;
8507 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 break;
8512 default:
8513 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 reason, input, &exc,
8515 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008516 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008518 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008519 Py_DECREF(repunicode);
8520 goto onError;
8521 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 repsize = PyUnicode_GET_LENGTH(repunicode);
8524 if (charmaptranslate_makespace(&output, &osize,
8525 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 Py_DECREF(repunicode);
8527 goto onError;
8528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 for (uni2 = 0; repsize-->0; ++uni2)
8530 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8531 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008533 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008534 }
8535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8537 if (!res)
8538 goto onError;
8539 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 Py_XDECREF(exc);
8541 Py_XDECREF(errorHandler);
8542 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 Py_XDECREF(exc);
8547 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 return NULL;
8549}
8550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551/* Deprecated. Use PyUnicode_Translate instead. */
8552PyObject *
8553PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8554 Py_ssize_t size,
8555 PyObject *mapping,
8556 const char *errors)
8557{
Christian Heimes5f520f42012-09-11 14:03:25 +02008558 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8560 if (!unicode)
8561 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008562 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8563 Py_DECREF(unicode);
8564 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565}
8566
Alexander Belopolsky40018472011-02-26 01:02:56 +00008567PyObject *
8568PyUnicode_Translate(PyObject *str,
8569 PyObject *mapping,
8570 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571{
8572 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008573
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 str = PyUnicode_FromObject(str);
8575 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008576 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 Py_DECREF(str);
8579 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580}
Tim Petersced69f82003-09-16 20:30:58 +00008581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008583fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584{
8585 /* No need to call PyUnicode_READY(self) because this function is only
8586 called as a callback from fixup() which does it already. */
8587 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8588 const int kind = PyUnicode_KIND(self);
8589 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008590 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008591 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 Py_ssize_t i;
8593
8594 for (i = 0; i < len; ++i) {
8595 ch = PyUnicode_READ(kind, data, i);
8596 fixed = 0;
8597 if (ch > 127) {
8598 if (Py_UNICODE_ISSPACE(ch))
8599 fixed = ' ';
8600 else {
8601 const int decimal = Py_UNICODE_TODECIMAL(ch);
8602 if (decimal >= 0)
8603 fixed = '0' + decimal;
8604 }
8605 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008606 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008607 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 PyUnicode_WRITE(kind, data, i, fixed);
8609 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008610 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008611 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 }
8614
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008615 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616}
8617
8618PyObject *
8619_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8620{
8621 if (!PyUnicode_Check(unicode)) {
8622 PyErr_BadInternalCall();
8623 return NULL;
8624 }
8625 if (PyUnicode_READY(unicode) == -1)
8626 return NULL;
8627 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8628 /* If the string is already ASCII, just return the same string */
8629 Py_INCREF(unicode);
8630 return unicode;
8631 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008632 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633}
8634
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008635PyObject *
8636PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8637 Py_ssize_t length)
8638{
Victor Stinnerf0124502011-11-21 23:12:56 +01008639 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008640 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008641 Py_UCS4 maxchar;
8642 enum PyUnicode_Kind kind;
8643 void *data;
8644
Victor Stinner99d7ad02012-02-22 13:37:39 +01008645 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008646 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008647 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008648 if (ch > 127) {
8649 int decimal = Py_UNICODE_TODECIMAL(ch);
8650 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008651 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008652 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008653 }
8654 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008655
8656 /* Copy to a new string */
8657 decimal = PyUnicode_New(length, maxchar);
8658 if (decimal == NULL)
8659 return decimal;
8660 kind = PyUnicode_KIND(decimal);
8661 data = PyUnicode_DATA(decimal);
8662 /* Iterate over code points */
8663 for (i = 0; i < length; i++) {
8664 Py_UNICODE ch = s[i];
8665 if (ch > 127) {
8666 int decimal = Py_UNICODE_TODECIMAL(ch);
8667 if (decimal >= 0)
8668 ch = '0' + decimal;
8669 }
8670 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008672 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008673}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008674/* --- Decimal Encoder ---------------------------------------------------- */
8675
Alexander Belopolsky40018472011-02-26 01:02:56 +00008676int
8677PyUnicode_EncodeDecimal(Py_UNICODE *s,
8678 Py_ssize_t length,
8679 char *output,
8680 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008681{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008682 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008683 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008684 enum PyUnicode_Kind kind;
8685 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008686
8687 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 PyErr_BadArgument();
8689 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008690 }
8691
Victor Stinner42bf7752011-11-21 22:52:58 +01008692 unicode = PyUnicode_FromUnicode(s, length);
8693 if (unicode == NULL)
8694 return -1;
8695
Benjamin Petersonbac79492012-01-14 13:34:47 -05008696 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008697 Py_DECREF(unicode);
8698 return -1;
8699 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008700 kind = PyUnicode_KIND(unicode);
8701 data = PyUnicode_DATA(unicode);
8702
Victor Stinnerb84d7232011-11-22 01:50:07 +01008703 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008704 PyObject *exc;
8705 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008707 Py_ssize_t startpos;
8708
8709 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008710
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008712 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008713 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008715 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 decimal = Py_UNICODE_TODECIMAL(ch);
8717 if (decimal >= 0) {
8718 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008719 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 continue;
8721 }
8722 if (0 < ch && ch < 256) {
8723 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008724 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 continue;
8726 }
Victor Stinner6345be92011-11-25 20:09:01 +01008727
Victor Stinner42bf7752011-11-21 22:52:58 +01008728 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008729 exc = NULL;
8730 raise_encode_exception(&exc, "decimal", unicode,
8731 startpos, startpos+1,
8732 "invalid decimal Unicode string");
8733 Py_XDECREF(exc);
8734 Py_DECREF(unicode);
8735 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008736 }
8737 /* 0-terminate the output string */
8738 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008739 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008740 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008741}
8742
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743/* --- Helpers ------------------------------------------------------------ */
8744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008746any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 Py_ssize_t start,
8748 Py_ssize_t end)
8749{
8750 int kind1, kind2, kind;
8751 void *buf1, *buf2;
8752 Py_ssize_t len1, len2, result;
8753
8754 kind1 = PyUnicode_KIND(s1);
8755 kind2 = PyUnicode_KIND(s2);
8756 kind = kind1 > kind2 ? kind1 : kind2;
8757 buf1 = PyUnicode_DATA(s1);
8758 buf2 = PyUnicode_DATA(s2);
8759 if (kind1 != kind)
8760 buf1 = _PyUnicode_AsKind(s1, kind);
8761 if (!buf1)
8762 return -2;
8763 if (kind2 != kind)
8764 buf2 = _PyUnicode_AsKind(s2, kind);
8765 if (!buf2) {
8766 if (kind1 != kind) PyMem_Free(buf1);
8767 return -2;
8768 }
8769 len1 = PyUnicode_GET_LENGTH(s1);
8770 len2 = PyUnicode_GET_LENGTH(s2);
8771
Victor Stinner794d5672011-10-10 03:21:36 +02008772 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008773 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008774 case PyUnicode_1BYTE_KIND:
8775 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8776 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8777 else
8778 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8779 break;
8780 case PyUnicode_2BYTE_KIND:
8781 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8782 break;
8783 case PyUnicode_4BYTE_KIND:
8784 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8785 break;
8786 default:
8787 assert(0); result = -2;
8788 }
8789 }
8790 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008791 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008792 case PyUnicode_1BYTE_KIND:
8793 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8794 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8795 else
8796 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8797 break;
8798 case PyUnicode_2BYTE_KIND:
8799 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8800 break;
8801 case PyUnicode_4BYTE_KIND:
8802 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8803 break;
8804 default:
8805 assert(0); result = -2;
8806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008807 }
8808
8809 if (kind1 != kind)
8810 PyMem_Free(buf1);
8811 if (kind2 != kind)
8812 PyMem_Free(buf2);
8813
8814 return result;
8815}
8816
8817Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008818_PyUnicode_InsertThousandsGrouping(
8819 PyObject *unicode, Py_ssize_t index,
8820 Py_ssize_t n_buffer,
8821 void *digits, Py_ssize_t n_digits,
8822 Py_ssize_t min_width,
8823 const char *grouping, PyObject *thousands_sep,
8824 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825{
Victor Stinner41a863c2012-02-24 00:37:51 +01008826 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008827 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008828 Py_ssize_t thousands_sep_len;
8829 Py_ssize_t len;
8830
8831 if (unicode != NULL) {
8832 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008833 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008834 }
8835 else {
8836 kind = PyUnicode_1BYTE_KIND;
8837 data = NULL;
8838 }
8839 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8840 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8841 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8842 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008843 if (thousands_sep_kind < kind) {
8844 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8845 if (!thousands_sep_data)
8846 return -1;
8847 }
8848 else {
8849 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8850 if (!data)
8851 return -1;
8852 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008853 }
8854
Benjamin Petersonead6b532011-12-20 17:23:42 -06008855 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008857 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008858 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008859 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008860 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008861 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008862 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008863 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008864 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008865 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008866 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008867 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008869 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008870 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008871 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008872 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008873 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008875 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008876 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008877 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008878 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008879 break;
8880 default:
8881 assert(0);
8882 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008884 if (unicode != NULL && thousands_sep_kind != kind) {
8885 if (thousands_sep_kind < kind)
8886 PyMem_Free(thousands_sep_data);
8887 else
8888 PyMem_Free(data);
8889 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008890 if (unicode == NULL) {
8891 *maxchar = 127;
8892 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008893 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008894 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008895 }
8896 }
8897 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898}
8899
8900
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, and resolves negative `start` / `end` relative
   to `len` (flooring at 0).  `start` and `end` must be modifiable
   lvalues; `len` is evaluated more than once, so it must be free of side
   effects.  Wrapped in do { } while (0) so that the expansion is a single
   statement and is safe inside unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008915
Alexander Belopolsky40018472011-02-26 01:02:56 +00008916Py_ssize_t
8917PyUnicode_Count(PyObject *str,
8918 PyObject *substr,
8919 Py_ssize_t start,
8920 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008922 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008923 PyObject* str_obj;
8924 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 int kind1, kind2, kind;
8926 void *buf1 = NULL, *buf2 = NULL;
8927 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008928
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008929 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008930 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008932 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008933 if (!sub_obj) {
8934 Py_DECREF(str_obj);
8935 return -1;
8936 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008937 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008938 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 Py_DECREF(str_obj);
8940 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 }
Tim Petersced69f82003-09-16 20:30:58 +00008942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 kind1 = PyUnicode_KIND(str_obj);
8944 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008945 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008948 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008949 if (kind2 > kind) {
8950 Py_DECREF(sub_obj);
8951 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008952 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008953 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008954 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 if (!buf2)
8957 goto onError;
8958 len1 = PyUnicode_GET_LENGTH(str_obj);
8959 len2 = PyUnicode_GET_LENGTH(sub_obj);
8960
8961 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008962 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008964 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8965 result = asciilib_count(
8966 ((Py_UCS1*)buf1) + start, end - start,
8967 buf2, len2, PY_SSIZE_T_MAX
8968 );
8969 else
8970 result = ucs1lib_count(
8971 ((Py_UCS1*)buf1) + start, end - start,
8972 buf2, len2, PY_SSIZE_T_MAX
8973 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 break;
8975 case PyUnicode_2BYTE_KIND:
8976 result = ucs2lib_count(
8977 ((Py_UCS2*)buf1) + start, end - start,
8978 buf2, len2, PY_SSIZE_T_MAX
8979 );
8980 break;
8981 case PyUnicode_4BYTE_KIND:
8982 result = ucs4lib_count(
8983 ((Py_UCS4*)buf1) + start, end - start,
8984 buf2, len2, PY_SSIZE_T_MAX
8985 );
8986 break;
8987 default:
8988 assert(0); result = 0;
8989 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008990
8991 Py_DECREF(sub_obj);
8992 Py_DECREF(str_obj);
8993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 if (kind2 != kind)
8995 PyMem_Free(buf2);
8996
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998 onError:
8999 Py_DECREF(sub_obj);
9000 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 if (kind2 != kind && buf2)
9002 PyMem_Free(buf2);
9003 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004}
9005
Alexander Belopolsky40018472011-02-26 01:02:56 +00009006Py_ssize_t
9007PyUnicode_Find(PyObject *str,
9008 PyObject *sub,
9009 Py_ssize_t start,
9010 Py_ssize_t end,
9011 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009013 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009014
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009016 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009018 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009019 if (!sub) {
9020 Py_DECREF(str);
9021 return -2;
9022 }
9023 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9024 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 Py_DECREF(str);
9026 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 }
Tim Petersced69f82003-09-16 20:30:58 +00009028
Victor Stinner794d5672011-10-10 03:21:36 +02009029 result = any_find_slice(direction,
9030 str, sub, start, end
9031 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009032
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009034 Py_DECREF(sub);
9035
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 return result;
9037}
9038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039Py_ssize_t
9040PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9041 Py_ssize_t start, Py_ssize_t end,
9042 int direction)
9043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009045 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 if (PyUnicode_READY(str) == -1)
9047 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009048 if (start < 0 || end < 0) {
9049 PyErr_SetString(PyExc_IndexError, "string index out of range");
9050 return -2;
9051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (end > PyUnicode_GET_LENGTH(str))
9053 end = PyUnicode_GET_LENGTH(str);
9054 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009055 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9056 kind, end-start, ch, direction);
9057 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009059 else
9060 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061}
9062
Alexander Belopolsky40018472011-02-26 01:02:56 +00009063static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009064tailmatch(PyObject *self,
9065 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009066 Py_ssize_t start,
9067 Py_ssize_t end,
9068 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 int kind_self;
9071 int kind_sub;
9072 void *data_self;
9073 void *data_sub;
9074 Py_ssize_t offset;
9075 Py_ssize_t i;
9076 Py_ssize_t end_sub;
9077
9078 if (PyUnicode_READY(self) == -1 ||
9079 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009080 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081
9082 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 return 1;
9084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9086 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 kind_self = PyUnicode_KIND(self);
9091 data_self = PyUnicode_DATA(self);
9092 kind_sub = PyUnicode_KIND(substring);
9093 data_sub = PyUnicode_DATA(substring);
9094 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9095
9096 if (direction > 0)
9097 offset = end;
9098 else
9099 offset = start;
9100
9101 if (PyUnicode_READ(kind_self, data_self, offset) ==
9102 PyUnicode_READ(kind_sub, data_sub, 0) &&
9103 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9104 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9105 /* If both are of the same kind, memcmp is sufficient */
9106 if (kind_self == kind_sub) {
9107 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009108 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 data_sub,
9110 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009111 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 }
9113 /* otherwise we have to compare each character by first accesing it */
9114 else {
9115 /* We do not need to compare 0 and len(substring)-1 because
9116 the if statement above ensured already that they are equal
9117 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 for (i = 1; i < end_sub; ++i) {
9119 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9120 PyUnicode_READ(kind_sub, data_sub, i))
9121 return 0;
9122 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 }
9126
9127 return 0;
9128}
9129
Alexander Belopolsky40018472011-02-26 01:02:56 +00009130Py_ssize_t
9131PyUnicode_Tailmatch(PyObject *str,
9132 PyObject *substr,
9133 Py_ssize_t start,
9134 Py_ssize_t end,
9135 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009137 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009138
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139 str = PyUnicode_FromObject(str);
9140 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 substr = PyUnicode_FromObject(substr);
9143 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 Py_DECREF(str);
9145 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 }
Tim Petersced69f82003-09-16 20:30:58 +00009147
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009148 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 Py_DECREF(str);
9151 Py_DECREF(substr);
9152 return result;
9153}
9154
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155/* Apply fixfct filter to the Unicode object self and return a
9156 reference to the modified object */
9157
Alexander Belopolsky40018472011-02-26 01:02:56 +00009158static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009159fixup(PyObject *self,
9160 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 PyObject *u;
9163 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009164 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009166 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009169 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 /* fix functions return the new maximum character in a string,
9172 if the kind of the resulting unicode object does not change,
9173 everything is fine. Otherwise we need to change the string kind
9174 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009175 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009176
9177 if (maxchar_new == 0) {
9178 /* no changes */;
9179 if (PyUnicode_CheckExact(self)) {
9180 Py_DECREF(u);
9181 Py_INCREF(self);
9182 return self;
9183 }
9184 else
9185 return u;
9186 }
9187
Victor Stinnere6abb482012-05-02 01:15:40 +02009188 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189
Victor Stinnereaab6042011-12-11 22:22:39 +01009190 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009192
9193 /* In case the maximum character changed, we need to
9194 convert the string to the new category. */
9195 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9196 if (v == NULL) {
9197 Py_DECREF(u);
9198 return NULL;
9199 }
9200 if (maxchar_new > maxchar_old) {
9201 /* If the maxchar increased so that the kind changed, not all
9202 characters are representable anymore and we need to fix the
9203 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009204 _PyUnicode_FastCopyCharacters(v, 0,
9205 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009206 maxchar_old = fixfct(v);
9207 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208 }
9209 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009210 _PyUnicode_FastCopyCharacters(v, 0,
9211 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009213 Py_DECREF(u);
9214 assert(_PyUnicode_CheckConsistency(v, 1));
9215 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216}
9217
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009218static PyObject *
9219ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009221 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9222 char *resdata, *data = PyUnicode_DATA(self);
9223 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009224
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009225 res = PyUnicode_New(len, 127);
9226 if (res == NULL)
9227 return NULL;
9228 resdata = PyUnicode_DATA(res);
9229 if (lower)
9230 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009232 _Py_bytes_upper(resdata, data, len);
9233 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234}
9235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009237handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009239 Py_ssize_t j;
9240 int final_sigma;
9241 Py_UCS4 c;
9242 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009243
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009244 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9245
9246 where ! is a negation and \p{xxx} is a character with property xxx.
9247 */
9248 for (j = i - 1; j >= 0; j--) {
9249 c = PyUnicode_READ(kind, data, j);
9250 if (!_PyUnicode_IsCaseIgnorable(c))
9251 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009253 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9254 if (final_sigma) {
9255 for (j = i + 1; j < length; j++) {
9256 c = PyUnicode_READ(kind, data, j);
9257 if (!_PyUnicode_IsCaseIgnorable(c))
9258 break;
9259 }
9260 final_sigma = j == length || !_PyUnicode_IsCased(c);
9261 }
9262 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263}
9264
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009265static int
9266lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9267 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009269 /* Obscure special case. */
9270 if (c == 0x3A3) {
9271 mapped[0] = handle_capital_sigma(kind, data, length, i);
9272 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009274 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275}
9276
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009277static Py_ssize_t
9278do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009280 Py_ssize_t i, k = 0;
9281 int n_res, j;
9282 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009283
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009284 c = PyUnicode_READ(kind, data, 0);
9285 n_res = _PyUnicode_ToUpperFull(c, mapped);
9286 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009287 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009288 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009290 for (i = 1; i < length; i++) {
9291 c = PyUnicode_READ(kind, data, i);
9292 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9293 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009294 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009295 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009296 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009297 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009298 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299}
9300
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009301static Py_ssize_t
9302do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9303 Py_ssize_t i, k = 0;
9304
9305 for (i = 0; i < length; i++) {
9306 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9307 int n_res, j;
9308 if (Py_UNICODE_ISUPPER(c)) {
9309 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9310 }
9311 else if (Py_UNICODE_ISLOWER(c)) {
9312 n_res = _PyUnicode_ToUpperFull(c, mapped);
9313 }
9314 else {
9315 n_res = 1;
9316 mapped[0] = c;
9317 }
9318 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009319 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009320 res[k++] = mapped[j];
9321 }
9322 }
9323 return k;
9324}
9325
9326static Py_ssize_t
9327do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9328 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009330 Py_ssize_t i, k = 0;
9331
9332 for (i = 0; i < length; i++) {
9333 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9334 int n_res, j;
9335 if (lower)
9336 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9337 else
9338 n_res = _PyUnicode_ToUpperFull(c, mapped);
9339 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009340 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009341 res[k++] = mapped[j];
9342 }
9343 }
9344 return k;
9345}
9346
9347static Py_ssize_t
9348do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9349{
9350 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9351}
9352
9353static Py_ssize_t
9354do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9355{
9356 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9357}
9358
Benjamin Petersone51757f2012-01-12 21:10:29 -05009359static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009360do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9361{
9362 Py_ssize_t i, k = 0;
9363
9364 for (i = 0; i < length; i++) {
9365 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9366 Py_UCS4 mapped[3];
9367 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9368 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009369 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009370 res[k++] = mapped[j];
9371 }
9372 }
9373 return k;
9374}
9375
9376static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009377do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9378{
9379 Py_ssize_t i, k = 0;
9380 int previous_is_cased;
9381
9382 previous_is_cased = 0;
9383 for (i = 0; i < length; i++) {
9384 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9385 Py_UCS4 mapped[3];
9386 int n_res, j;
9387
9388 if (previous_is_cased)
9389 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9390 else
9391 n_res = _PyUnicode_ToTitleFull(c, mapped);
9392
9393 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009394 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009395 res[k++] = mapped[j];
9396 }
9397
9398 previous_is_cased = _PyUnicode_IsCased(c);
9399 }
9400 return k;
9401}
9402
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009403static PyObject *
9404case_operation(PyObject *self,
9405 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9406{
9407 PyObject *res = NULL;
9408 Py_ssize_t length, newlength = 0;
9409 int kind, outkind;
9410 void *data, *outdata;
9411 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9412
Benjamin Petersoneea48462012-01-16 14:28:50 -05009413 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009414
9415 kind = PyUnicode_KIND(self);
9416 data = PyUnicode_DATA(self);
9417 length = PyUnicode_GET_LENGTH(self);
9418 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9419 if (tmp == NULL)
9420 return PyErr_NoMemory();
9421 newlength = perform(kind, data, length, tmp, &maxchar);
9422 res = PyUnicode_New(newlength, maxchar);
9423 if (res == NULL)
9424 goto leave;
9425 tmpend = tmp + newlength;
9426 outdata = PyUnicode_DATA(res);
9427 outkind = PyUnicode_KIND(res);
9428 switch (outkind) {
9429 case PyUnicode_1BYTE_KIND:
9430 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9431 break;
9432 case PyUnicode_2BYTE_KIND:
9433 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9434 break;
9435 case PyUnicode_4BYTE_KIND:
9436 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9437 break;
9438 default:
9439 assert(0);
9440 break;
9441 }
9442 leave:
9443 PyMem_FREE(tmp);
9444 return res;
9445}
9446
Tim Peters8ce9f162004-08-27 01:49:32 +00009447PyObject *
9448PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009451 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009453 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009454 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9455 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009456 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009458 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009460 int use_memcpy;
9461 unsigned char *res_data = NULL, *sep_data = NULL;
9462 PyObject *last_obj;
9463 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464
Tim Peters05eba1f2004-08-27 21:32:02 +00009465 fseq = PySequence_Fast(seq, "");
9466 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009467 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009468 }
9469
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009470 /* NOTE: the following code can't call back into Python code,
9471 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009472 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009473
Tim Peters05eba1f2004-08-27 21:32:02 +00009474 seqlen = PySequence_Fast_GET_SIZE(fseq);
9475 /* If empty sequence, return u"". */
9476 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009477 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009478 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009479 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009480
Tim Peters05eba1f2004-08-27 21:32:02 +00009481 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009482 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009483 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009484 if (seqlen == 1) {
9485 if (PyUnicode_CheckExact(items[0])) {
9486 res = items[0];
9487 Py_INCREF(res);
9488 Py_DECREF(fseq);
9489 return res;
9490 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009491 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009492 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009493 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009494 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009495 /* Set up sep and seplen */
9496 if (separator == NULL) {
9497 /* fall back to a blank space separator */
9498 sep = PyUnicode_FromOrdinal(' ');
9499 if (!sep)
9500 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009501 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009502 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009503 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009504 else {
9505 if (!PyUnicode_Check(separator)) {
9506 PyErr_Format(PyExc_TypeError,
9507 "separator: expected str instance,"
9508 " %.80s found",
9509 Py_TYPE(separator)->tp_name);
9510 goto onError;
9511 }
9512 if (PyUnicode_READY(separator))
9513 goto onError;
9514 sep = separator;
9515 seplen = PyUnicode_GET_LENGTH(separator);
9516 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9517 /* inc refcount to keep this code path symmetric with the
9518 above case of a blank separator */
9519 Py_INCREF(sep);
9520 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009521 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009522 }
9523
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009524 /* There are at least two things to join, or else we have a subclass
9525 * of str in the sequence.
9526 * Do a pre-pass to figure out the total amount of space we'll
9527 * need (sz), and see whether all argument are strings.
9528 */
9529 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009530#ifdef Py_DEBUG
9531 use_memcpy = 0;
9532#else
9533 use_memcpy = 1;
9534#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009535 for (i = 0; i < seqlen; i++) {
9536 const Py_ssize_t old_sz = sz;
9537 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 if (!PyUnicode_Check(item)) {
9539 PyErr_Format(PyExc_TypeError,
9540 "sequence item %zd: expected str instance,"
9541 " %.80s found",
9542 i, Py_TYPE(item)->tp_name);
9543 goto onError;
9544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 if (PyUnicode_READY(item) == -1)
9546 goto onError;
9547 sz += PyUnicode_GET_LENGTH(item);
9548 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009549 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009550 if (i != 0)
9551 sz += seplen;
9552 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9553 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009555 goto onError;
9556 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009557 if (use_memcpy && last_obj != NULL) {
9558 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9559 use_memcpy = 0;
9560 }
9561 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009562 }
Tim Petersced69f82003-09-16 20:30:58 +00009563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009565 if (res == NULL)
9566 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009567
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009568 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009569#ifdef Py_DEBUG
9570 use_memcpy = 0;
9571#else
9572 if (use_memcpy) {
9573 res_data = PyUnicode_1BYTE_DATA(res);
9574 kind = PyUnicode_KIND(res);
9575 if (seplen != 0)
9576 sep_data = PyUnicode_1BYTE_DATA(sep);
9577 }
9578#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009579 if (use_memcpy) {
9580 for (i = 0; i < seqlen; ++i) {
9581 Py_ssize_t itemlen;
9582 item = items[i];
9583
9584 /* Copy item, and maybe the separator. */
9585 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009586 Py_MEMCPY(res_data,
9587 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009588 kind * seplen);
9589 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009590 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009591
9592 itemlen = PyUnicode_GET_LENGTH(item);
9593 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009594 Py_MEMCPY(res_data,
9595 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009596 kind * itemlen);
9597 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009598 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009599 }
9600 assert(res_data == PyUnicode_1BYTE_DATA(res)
9601 + kind * PyUnicode_GET_LENGTH(res));
9602 }
9603 else {
9604 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9605 Py_ssize_t itemlen;
9606 item = items[i];
9607
9608 /* Copy item, and maybe the separator. */
9609 if (i && seplen != 0) {
9610 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9611 res_offset += seplen;
9612 }
9613
9614 itemlen = PyUnicode_GET_LENGTH(item);
9615 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009616 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009617 res_offset += itemlen;
9618 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009619 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009620 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009621 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009622
Tim Peters05eba1f2004-08-27 21:32:02 +00009623 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009625 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009629 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009631 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632 return NULL;
9633}
9634
/* FILL(kind, data, value, start, length): store value into length
   consecutive characters of the buffer data, beginning at character index
   start.  kind selects the character width (1, 2 or 4 bytes); the wchar
   kind is rejected by an assertion.

   Every argument is parenthesized in the expansion, but value and length
   are evaluated more than once, so they must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9658
Victor Stinnerd3f08822012-05-29 12:57:52 +02009659void
9660_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9661 Py_UCS4 fill_char)
9662{
9663 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9664 const void *data = PyUnicode_DATA(unicode);
9665 assert(PyUnicode_IS_READY(unicode));
9666 assert(unicode_modifiable(unicode));
9667 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9668 assert(start >= 0);
9669 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9670 FILL(kind, data, fill_char, start, length);
9671}
9672
Victor Stinner3fe55312012-01-04 00:33:50 +01009673Py_ssize_t
9674PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9675 Py_UCS4 fill_char)
9676{
9677 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009678
9679 if (!PyUnicode_Check(unicode)) {
9680 PyErr_BadInternalCall();
9681 return -1;
9682 }
9683 if (PyUnicode_READY(unicode) == -1)
9684 return -1;
9685 if (unicode_check_modifiable(unicode))
9686 return -1;
9687
Victor Stinnerd3f08822012-05-29 12:57:52 +02009688 if (start < 0) {
9689 PyErr_SetString(PyExc_IndexError, "string index out of range");
9690 return -1;
9691 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009692 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9693 PyErr_SetString(PyExc_ValueError,
9694 "fill character is bigger than "
9695 "the string maximum character");
9696 return -1;
9697 }
9698
9699 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9700 length = Py_MIN(maxlen, length);
9701 if (length <= 0)
9702 return 0;
9703
Victor Stinnerd3f08822012-05-29 12:57:52 +02009704 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009705 return length;
9706}
9707
Victor Stinner9310abb2011-10-05 00:59:23 +02009708static PyObject *
9709pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009710 Py_ssize_t left,
9711 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 PyObject *u;
9715 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009716 int kind;
9717 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718
9719 if (left < 0)
9720 left = 0;
9721 if (right < 0)
9722 right = 0;
9723
Victor Stinnerc4b49542011-12-11 22:44:26 +01009724 if (left == 0 && right == 0)
9725 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9728 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009729 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9730 return NULL;
9731 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009733 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009735 if (!u)
9736 return NULL;
9737
9738 kind = PyUnicode_KIND(u);
9739 data = PyUnicode_DATA(u);
9740 if (left)
9741 FILL(kind, data, fill, 0, left);
9742 if (right)
9743 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009744 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009745 assert(_PyUnicode_CheckConsistency(u, 1));
9746 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747}
9748
Alexander Belopolsky40018472011-02-26 01:02:56 +00009749PyObject *
9750PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753
9754 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009755 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009756 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009757 if (PyUnicode_READY(string) == -1) {
9758 Py_DECREF(string);
9759 return NULL;
9760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761
Benjamin Petersonead6b532011-12-20 17:23:42 -06009762 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009764 if (PyUnicode_IS_ASCII(string))
9765 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009766 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009767 PyUnicode_GET_LENGTH(string), keepends);
9768 else
9769 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009770 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009771 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 break;
9773 case PyUnicode_2BYTE_KIND:
9774 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009775 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 PyUnicode_GET_LENGTH(string), keepends);
9777 break;
9778 case PyUnicode_4BYTE_KIND:
9779 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009780 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 PyUnicode_GET_LENGTH(string), keepends);
9782 break;
9783 default:
9784 assert(0);
9785 list = 0;
9786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787 Py_DECREF(string);
9788 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789}
9790
Alexander Belopolsky40018472011-02-26 01:02:56 +00009791static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009792split(PyObject *self,
9793 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009794 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 int kind1, kind2, kind;
9797 void *buf1, *buf2;
9798 Py_ssize_t len1, len2;
9799 PyObject* out;
9800
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009802 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 if (PyUnicode_READY(self) == -1)
9805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009808 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009810 if (PyUnicode_IS_ASCII(self))
9811 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009812 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009813 PyUnicode_GET_LENGTH(self), maxcount
9814 );
9815 else
9816 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009817 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009818 PyUnicode_GET_LENGTH(self), maxcount
9819 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 case PyUnicode_2BYTE_KIND:
9821 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009822 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 PyUnicode_GET_LENGTH(self), maxcount
9824 );
9825 case PyUnicode_4BYTE_KIND:
9826 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009827 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 PyUnicode_GET_LENGTH(self), maxcount
9829 );
9830 default:
9831 assert(0);
9832 return NULL;
9833 }
9834
9835 if (PyUnicode_READY(substring) == -1)
9836 return NULL;
9837
9838 kind1 = PyUnicode_KIND(self);
9839 kind2 = PyUnicode_KIND(substring);
9840 kind = kind1 > kind2 ? kind1 : kind2;
9841 buf1 = PyUnicode_DATA(self);
9842 buf2 = PyUnicode_DATA(substring);
9843 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009844 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 if (!buf1)
9846 return NULL;
9847 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009848 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 if (!buf2) {
9850 if (kind1 != kind) PyMem_Free(buf1);
9851 return NULL;
9852 }
9853 len1 = PyUnicode_GET_LENGTH(self);
9854 len2 = PyUnicode_GET_LENGTH(substring);
9855
Benjamin Petersonead6b532011-12-20 17:23:42 -06009856 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009858 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9859 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009860 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009861 else
9862 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009863 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 break;
9865 case PyUnicode_2BYTE_KIND:
9866 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009867 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 break;
9869 case PyUnicode_4BYTE_KIND:
9870 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009871 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 break;
9873 default:
9874 out = NULL;
9875 }
9876 if (kind1 != kind)
9877 PyMem_Free(buf1);
9878 if (kind2 != kind)
9879 PyMem_Free(buf2);
9880 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881}
9882
Alexander Belopolsky40018472011-02-26 01:02:56 +00009883static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009884rsplit(PyObject *self,
9885 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009886 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 int kind1, kind2, kind;
9889 void *buf1, *buf2;
9890 Py_ssize_t len1, len2;
9891 PyObject* out;
9892
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009893 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009894 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 if (PyUnicode_READY(self) == -1)
9897 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009900 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009902 if (PyUnicode_IS_ASCII(self))
9903 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009904 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009905 PyUnicode_GET_LENGTH(self), maxcount
9906 );
9907 else
9908 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009909 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009910 PyUnicode_GET_LENGTH(self), maxcount
9911 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 case PyUnicode_2BYTE_KIND:
9913 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009914 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 PyUnicode_GET_LENGTH(self), maxcount
9916 );
9917 case PyUnicode_4BYTE_KIND:
9918 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009919 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 PyUnicode_GET_LENGTH(self), maxcount
9921 );
9922 default:
9923 assert(0);
9924 return NULL;
9925 }
9926
9927 if (PyUnicode_READY(substring) == -1)
9928 return NULL;
9929
9930 kind1 = PyUnicode_KIND(self);
9931 kind2 = PyUnicode_KIND(substring);
9932 kind = kind1 > kind2 ? kind1 : kind2;
9933 buf1 = PyUnicode_DATA(self);
9934 buf2 = PyUnicode_DATA(substring);
9935 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009936 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 if (!buf1)
9938 return NULL;
9939 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009940 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 if (!buf2) {
9942 if (kind1 != kind) PyMem_Free(buf1);
9943 return NULL;
9944 }
9945 len1 = PyUnicode_GET_LENGTH(self);
9946 len2 = PyUnicode_GET_LENGTH(substring);
9947
Benjamin Petersonead6b532011-12-20 17:23:42 -06009948 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009950 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9951 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009952 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009953 else
9954 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009955 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 break;
9957 case PyUnicode_2BYTE_KIND:
9958 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009959 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 break;
9961 case PyUnicode_4BYTE_KIND:
9962 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 break;
9965 default:
9966 out = NULL;
9967 }
9968 if (kind1 != kind)
9969 PyMem_Free(buf1);
9970 if (kind2 != kind)
9971 PyMem_Free(buf2);
9972 return out;
9973}
9974
9975static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009976anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9977 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009979 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009981 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9982 return asciilib_find(buf1, len1, buf2, len2, offset);
9983 else
9984 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 case PyUnicode_2BYTE_KIND:
9986 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9987 case PyUnicode_4BYTE_KIND:
9988 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9989 }
9990 assert(0);
9991 return -1;
9992}
9993
9994static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009995anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9996 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009998 switch (kind) {
9999 case PyUnicode_1BYTE_KIND:
10000 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10001 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10002 else
10003 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10004 case PyUnicode_2BYTE_KIND:
10005 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10006 case PyUnicode_4BYTE_KIND:
10007 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10008 }
10009 assert(0);
10010 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010011}
10012
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010013static void
10014replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10015 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10016{
10017 int kind = PyUnicode_KIND(u);
10018 void *data = PyUnicode_DATA(u);
10019 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10020 if (kind == PyUnicode_1BYTE_KIND) {
10021 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10022 (Py_UCS1 *)data + len,
10023 u1, u2, maxcount);
10024 }
10025 else if (kind == PyUnicode_2BYTE_KIND) {
10026 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10027 (Py_UCS2 *)data + len,
10028 u1, u2, maxcount);
10029 }
10030 else {
10031 assert(kind == PyUnicode_4BYTE_KIND);
10032 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10033 (Py_UCS4 *)data + len,
10034 u1, u2, maxcount);
10035 }
10036}
10037
Alexander Belopolsky40018472011-02-26 01:02:56 +000010038static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039replace(PyObject *self, PyObject *str1,
10040 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 PyObject *u;
10043 char *sbuf = PyUnicode_DATA(self);
10044 char *buf1 = PyUnicode_DATA(str1);
10045 char *buf2 = PyUnicode_DATA(str2);
10046 int srelease = 0, release1 = 0, release2 = 0;
10047 int skind = PyUnicode_KIND(self);
10048 int kind1 = PyUnicode_KIND(str1);
10049 int kind2 = PyUnicode_KIND(str2);
10050 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10051 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10052 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010053 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010054 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055
10056 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010057 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010059 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060
Victor Stinner59de0ee2011-10-07 10:01:28 +020010061 if (str1 == str2)
10062 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063
Victor Stinner49a0a212011-10-12 23:46:10 +020010064 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010065 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10066 if (maxchar < maxchar_str1)
10067 /* substring too wide to be present */
10068 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010069 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10070 /* Replacing str1 with str2 may cause a maxchar reduction in the
10071 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010072 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010073 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010076 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010078 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010080 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010081 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010082 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010083
Victor Stinner69ed0f42013-04-09 21:48:24 +020010084 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010085 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010086 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010087 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010088 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010090 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010092
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010093 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10094 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010095 }
10096 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 int rkind = skind;
10098 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010099 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 if (kind1 < rkind) {
10102 /* widen substring */
10103 buf1 = _PyUnicode_AsKind(str1, rkind);
10104 if (!buf1) goto error;
10105 release1 = 1;
10106 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010107 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010108 if (i < 0)
10109 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 if (rkind > kind2) {
10111 /* widen replacement */
10112 buf2 = _PyUnicode_AsKind(str2, rkind);
10113 if (!buf2) goto error;
10114 release2 = 1;
10115 }
10116 else if (rkind < kind2) {
10117 /* widen self and buf1 */
10118 rkind = kind2;
10119 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010120 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 sbuf = _PyUnicode_AsKind(self, rkind);
10122 if (!sbuf) goto error;
10123 srelease = 1;
10124 buf1 = _PyUnicode_AsKind(str1, rkind);
10125 if (!buf1) goto error;
10126 release1 = 1;
10127 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010128 u = PyUnicode_New(slen, maxchar);
10129 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010131 assert(PyUnicode_KIND(u) == rkind);
10132 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010133
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010134 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010135 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010136 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010138 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010140
10141 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010143 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010144 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010145 if (i == -1)
10146 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010149 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010153 }
10154 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010156 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 int rkind = skind;
10158 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010161 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 buf1 = _PyUnicode_AsKind(str1, rkind);
10163 if (!buf1) goto error;
10164 release1 = 1;
10165 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010166 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010167 if (n == 0)
10168 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010170 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 buf2 = _PyUnicode_AsKind(str2, rkind);
10172 if (!buf2) goto error;
10173 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010176 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 rkind = kind2;
10178 sbuf = _PyUnicode_AsKind(self, rkind);
10179 if (!sbuf) goto error;
10180 srelease = 1;
10181 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010182 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 buf1 = _PyUnicode_AsKind(str1, rkind);
10184 if (!buf1) goto error;
10185 release1 = 1;
10186 }
10187 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10188 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010189 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 PyErr_SetString(PyExc_OverflowError,
10191 "replace string is too long");
10192 goto error;
10193 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010194 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010195 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010196 _Py_INCREF_UNICODE_EMPTY();
10197 if (!unicode_empty)
10198 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010199 u = unicode_empty;
10200 goto done;
10201 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010202 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 PyErr_SetString(PyExc_OverflowError,
10204 "replace string is too long");
10205 goto error;
10206 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010207 u = PyUnicode_New(new_size, maxchar);
10208 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010210 assert(PyUnicode_KIND(u) == rkind);
10211 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 ires = i = 0;
10213 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010214 while (n-- > 0) {
10215 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010217 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010219 if (j == -1)
10220 break;
10221 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010223 memcpy(res + rkind * ires,
10224 sbuf + rkind * i,
10225 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010227 }
10228 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010232 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010238 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010239 memcpy(res + rkind * ires,
10240 sbuf + rkind * i,
10241 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010242 }
10243 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244 /* interleave */
10245 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010248 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010250 if (--n <= 0)
10251 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010252 memcpy(res + rkind * ires,
10253 sbuf + rkind * i,
10254 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 ires++;
10256 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010257 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010258 memcpy(res + rkind * ires,
10259 sbuf + rkind * i,
10260 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010261 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010262 }
10263
10264 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010265 unicode_adjust_maxchar(&u);
10266 if (u == NULL)
10267 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010269
10270 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 if (srelease)
10272 PyMem_FREE(sbuf);
10273 if (release1)
10274 PyMem_FREE(buf1);
10275 if (release2)
10276 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010277 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010281 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (srelease)
10283 PyMem_FREE(sbuf);
10284 if (release1)
10285 PyMem_FREE(buf1);
10286 if (release2)
10287 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010288 return unicode_result_unchanged(self);
10289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 error:
10291 if (srelease && sbuf)
10292 PyMem_FREE(sbuf);
10293 if (release1 && buf1)
10294 PyMem_FREE(buf1);
10295 if (release2 && buf2)
10296 PyMem_FREE(buf2);
10297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298}
10299
10300/* --- Unicode Object Methods --------------------------------------------- */
10301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010302PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010303 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304\n\
10305Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010306characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307
10308static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010309unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010311 if (PyUnicode_READY(self) == -1)
10312 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010313 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314}
10315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010316PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010317 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318\n\
10319Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010320have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321
10322static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010323unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010325 if (PyUnicode_READY(self) == -1)
10326 return NULL;
10327 if (PyUnicode_GET_LENGTH(self) == 0)
10328 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010329 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330}
10331
Benjamin Petersond5890c82012-01-14 13:23:30 -050010332PyDoc_STRVAR(casefold__doc__,
10333 "S.casefold() -> str\n\
10334\n\
10335Return a version of S suitable for caseless comparisons.");
10336
10337static PyObject *
10338unicode_casefold(PyObject *self)
10339{
10340 if (PyUnicode_READY(self) == -1)
10341 return NULL;
10342 if (PyUnicode_IS_ASCII(self))
10343 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010344 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010345}
10346
10347
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010348/* Argument converter. Coerces to a single unicode character */
10349
10350static int
10351convert_uc(PyObject *obj, void *addr)
10352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010354 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010355
Benjamin Peterson14339b62009-01-31 16:36:08 +000010356 uniobj = PyUnicode_FromObject(obj);
10357 if (uniobj == NULL) {
10358 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010359 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010360 return 0;
10361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010363 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010364 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010365 Py_DECREF(uniobj);
10366 return 0;
10367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010369 Py_DECREF(uniobj);
10370 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010371}
10372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010373PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010374 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010376Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010377done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
10379static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010380unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010382 Py_ssize_t marg, left;
10383 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 Py_UCS4 fillchar = ' ';
10385
Victor Stinnere9a29352011-10-01 02:14:59 +020010386 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
Benjamin Petersonbac79492012-01-14 13:34:47 -050010389 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390 return NULL;
10391
Victor Stinnerc4b49542011-12-11 22:44:26 +010010392 if (PyUnicode_GET_LENGTH(self) >= width)
10393 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394
Victor Stinnerc4b49542011-12-11 22:44:26 +010010395 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396 left = marg / 2 + (marg & width & 1);
10397
Victor Stinner9310abb2011-10-05 00:59:23 +020010398 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399}
10400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401/* This function assumes that str1 and str2 are readied by the caller. */
10402
Marc-André Lemburge5034372000-08-08 08:04:29 +000010403static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010404unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010405{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010406#define COMPARE(TYPE1, TYPE2) \
10407 do { \
10408 TYPE1* p1 = (TYPE1 *)data1; \
10409 TYPE2* p2 = (TYPE2 *)data2; \
10410 TYPE1* end = p1 + len; \
10411 Py_UCS4 c1, c2; \
10412 for (; p1 != end; p1++, p2++) { \
10413 c1 = *p1; \
10414 c2 = *p2; \
10415 if (c1 != c2) \
10416 return (c1 < c2) ? -1 : 1; \
10417 } \
10418 } \
10419 while (0)
10420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 int kind1, kind2;
10422 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010423 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010424
Victor Stinner90db9c42012-10-04 21:53:50 +020010425 /* a string is equal to itself */
10426 if (str1 == str2)
10427 return 0;
10428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 kind1 = PyUnicode_KIND(str1);
10430 kind2 = PyUnicode_KIND(str2);
10431 data1 = PyUnicode_DATA(str1);
10432 data2 = PyUnicode_DATA(str2);
10433 len1 = PyUnicode_GET_LENGTH(str1);
10434 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010435 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010436
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010437 switch(kind1) {
10438 case PyUnicode_1BYTE_KIND:
10439 {
10440 switch(kind2) {
10441 case PyUnicode_1BYTE_KIND:
10442 {
10443 int cmp = memcmp(data1, data2, len);
10444 /* normalize result of memcmp() into the range [-1; 1] */
10445 if (cmp < 0)
10446 return -1;
10447 if (cmp > 0)
10448 return 1;
10449 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010450 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010451 case PyUnicode_2BYTE_KIND:
10452 COMPARE(Py_UCS1, Py_UCS2);
10453 break;
10454 case PyUnicode_4BYTE_KIND:
10455 COMPARE(Py_UCS1, Py_UCS4);
10456 break;
10457 default:
10458 assert(0);
10459 }
10460 break;
10461 }
10462 case PyUnicode_2BYTE_KIND:
10463 {
10464 switch(kind2) {
10465 case PyUnicode_1BYTE_KIND:
10466 COMPARE(Py_UCS2, Py_UCS1);
10467 break;
10468 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010469 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010470 COMPARE(Py_UCS2, Py_UCS2);
10471 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010472 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010473 case PyUnicode_4BYTE_KIND:
10474 COMPARE(Py_UCS2, Py_UCS4);
10475 break;
10476 default:
10477 assert(0);
10478 }
10479 break;
10480 }
10481 case PyUnicode_4BYTE_KIND:
10482 {
10483 switch(kind2) {
10484 case PyUnicode_1BYTE_KIND:
10485 COMPARE(Py_UCS4, Py_UCS1);
10486 break;
10487 case PyUnicode_2BYTE_KIND:
10488 COMPARE(Py_UCS4, Py_UCS2);
10489 break;
10490 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010491 {
10492#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10493 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10494 /* normalize result of wmemcmp() into the range [-1; 1] */
10495 if (cmp < 0)
10496 return -1;
10497 if (cmp > 0)
10498 return 1;
10499#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010500 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010501#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010502 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010503 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010504 default:
10505 assert(0);
10506 }
10507 break;
10508 }
10509 default:
10510 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010511 }
10512
Victor Stinner770e19e2012-10-04 22:59:45 +020010513 if (len1 == len2)
10514 return 0;
10515 if (len1 < len2)
10516 return -1;
10517 else
10518 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010519
10520#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010521}
10522
Victor Stinnere5567ad2012-10-23 02:48:49 +020010523static int
10524unicode_compare_eq(PyObject *str1, PyObject *str2)
10525{
10526 int kind;
10527 void *data1, *data2;
10528 Py_ssize_t len;
10529 int cmp;
10530
10531 /* a string is equal to itself */
10532 if (str1 == str2)
10533 return 1;
10534
10535 len = PyUnicode_GET_LENGTH(str1);
10536 if (PyUnicode_GET_LENGTH(str2) != len)
10537 return 0;
10538 kind = PyUnicode_KIND(str1);
10539 if (PyUnicode_KIND(str2) != kind)
10540 return 0;
10541 data1 = PyUnicode_DATA(str1);
10542 data2 = PyUnicode_DATA(str2);
10543
10544 cmp = memcmp(data1, data2, len * kind);
10545 return (cmp == 0);
10546}
10547
10548
Alexander Belopolsky40018472011-02-26 01:02:56 +000010549int
10550PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10553 if (PyUnicode_READY(left) == -1 ||
10554 PyUnicode_READY(right) == -1)
10555 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010556 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010558 PyErr_Format(PyExc_TypeError,
10559 "Can't compare %.100s and %.100s",
10560 left->ob_type->tp_name,
10561 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562 return -1;
10563}
10564
Martin v. Löwis5b222132007-06-10 09:51:05 +000010565int
10566PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 Py_ssize_t i;
10569 int kind;
10570 void *data;
10571 Py_UCS4 chr;
10572
Victor Stinner910337b2011-10-03 03:20:16 +020010573 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 if (PyUnicode_READY(uni) == -1)
10575 return -1;
10576 kind = PyUnicode_KIND(uni);
10577 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010578 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10580 if (chr != str[i])
10581 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010582 /* This check keeps Python strings that end in '\0' from comparing equal
10583 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010586 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010587 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010588 return 0;
10589}
10590
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010591
Benjamin Peterson29060642009-01-31 22:14:21 +000010592#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010593 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010594
Alexander Belopolsky40018472011-02-26 01:02:56 +000010595PyObject *
10596PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010597{
10598 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010599 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010600
Victor Stinnere5567ad2012-10-23 02:48:49 +020010601 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10602 Py_RETURN_NOTIMPLEMENTED;
10603
10604 if (PyUnicode_READY(left) == -1 ||
10605 PyUnicode_READY(right) == -1)
10606 return NULL;
10607
10608 if (op == Py_EQ || op == Py_NE) {
10609 result = unicode_compare_eq(left, right);
10610 if (op == Py_EQ)
10611 v = TEST_COND(result);
10612 else
10613 v = TEST_COND(!result);
10614 }
10615 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010616 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010617
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010618 /* Convert the return value to a Boolean */
10619 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010620 case Py_LE:
10621 v = TEST_COND(result <= 0);
10622 break;
10623 case Py_GE:
10624 v = TEST_COND(result >= 0);
10625 break;
10626 case Py_LT:
10627 v = TEST_COND(result == -1);
10628 break;
10629 case Py_GT:
10630 v = TEST_COND(result == 1);
10631 break;
10632 default:
10633 PyErr_BadArgument();
10634 return NULL;
10635 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010636 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010637 Py_INCREF(v);
10638 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010639}
10640
Alexander Belopolsky40018472011-02-26 01:02:56 +000010641int
10642PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010643{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010645 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 void *buf1, *buf2;
10647 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010648 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010649
10650 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651 sub = PyUnicode_FromObject(element);
10652 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010653 PyErr_Format(PyExc_TypeError,
10654 "'in <string>' requires string as left operand, not %s",
10655 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010657 }
10658
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010660 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 Py_DECREF(sub);
10662 return -1;
10663 }
10664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 kind1 = PyUnicode_KIND(str);
10666 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 buf1 = PyUnicode_DATA(str);
10668 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010669 if (kind2 != kind1) {
10670 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010671 Py_DECREF(sub);
10672 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010673 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010674 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010675 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (!buf2) {
10678 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010679 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 return -1;
10681 }
10682 len1 = PyUnicode_GET_LENGTH(str);
10683 len2 = PyUnicode_GET_LENGTH(sub);
10684
Victor Stinner77282cb2013-04-14 19:22:47 +020010685 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 case PyUnicode_1BYTE_KIND:
10687 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10688 break;
10689 case PyUnicode_2BYTE_KIND:
10690 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10691 break;
10692 case PyUnicode_4BYTE_KIND:
10693 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10694 break;
10695 default:
10696 result = -1;
10697 assert(0);
10698 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699
10700 Py_DECREF(str);
10701 Py_DECREF(sub);
10702
Victor Stinner77282cb2013-04-14 19:22:47 +020010703 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 PyMem_Free(buf2);
10705
Guido van Rossum403d68b2000-03-13 15:55:09 +000010706 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010707}
10708
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709/* Concat to string or Unicode object giving a new Unicode object. */
10710
Alexander Belopolsky40018472011-02-26 01:02:56 +000010711PyObject *
10712PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010715 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010716 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
10718 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725
10726 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010727 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010728 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010731 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010732 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 }
10735
Victor Stinner488fa492011-12-12 00:01:39 +010010736 u_len = PyUnicode_GET_LENGTH(u);
10737 v_len = PyUnicode_GET_LENGTH(v);
10738 if (u_len > PY_SSIZE_T_MAX - v_len) {
10739 PyErr_SetString(PyExc_OverflowError,
10740 "strings are too large to concat");
10741 goto onError;
10742 }
10743 new_len = u_len + v_len;
10744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010746 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010747 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010750 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010752 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010753 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10754 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755 Py_DECREF(u);
10756 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010757 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761 Py_XDECREF(u);
10762 Py_XDECREF(v);
10763 return NULL;
10764}
10765
Walter Dörwald1ab83302007-05-18 17:15:44 +000010766void
Victor Stinner23e56682011-10-03 03:54:37 +020010767PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010768{
Victor Stinner23e56682011-10-03 03:54:37 +020010769 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010770 Py_UCS4 maxchar, maxchar2;
10771 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010772
10773 if (p_left == NULL) {
10774 if (!PyErr_Occurred())
10775 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010776 return;
10777 }
Victor Stinner23e56682011-10-03 03:54:37 +020010778 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020010779 if (right == NULL || left == NULL
10780 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010781 if (!PyErr_Occurred())
10782 PyErr_BadInternalCall();
10783 goto error;
10784 }
10785
Benjamin Petersonbac79492012-01-14 13:34:47 -050010786 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010787 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010788 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010789 goto error;
10790
Victor Stinner488fa492011-12-12 00:01:39 +010010791 /* Shortcuts */
10792 if (left == unicode_empty) {
10793 Py_DECREF(left);
10794 Py_INCREF(right);
10795 *p_left = right;
10796 return;
10797 }
10798 if (right == unicode_empty)
10799 return;
10800
10801 left_len = PyUnicode_GET_LENGTH(left);
10802 right_len = PyUnicode_GET_LENGTH(right);
10803 if (left_len > PY_SSIZE_T_MAX - right_len) {
10804 PyErr_SetString(PyExc_OverflowError,
10805 "strings are too large to concat");
10806 goto error;
10807 }
10808 new_len = left_len + right_len;
10809
10810 if (unicode_modifiable(left)
10811 && PyUnicode_CheckExact(right)
10812 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010813 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10814 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010815 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010816 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010817 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10818 {
10819 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010820 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010010821 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020010822
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010823 /* copy 'right' into the newly allocated area of 'left' */
10824 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010825 }
Victor Stinner488fa492011-12-12 00:01:39 +010010826 else {
10827 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10828 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010829 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010830
Victor Stinner488fa492011-12-12 00:01:39 +010010831 /* Concat the two Unicode strings */
10832 res = PyUnicode_New(new_len, maxchar);
10833 if (res == NULL)
10834 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010835 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10836 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010837 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010838 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010010839 }
10840 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010841 return;
10842
10843error:
Victor Stinner488fa492011-12-12 00:01:39 +010010844 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010845}
10846
10847void
10848PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10849{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010850 PyUnicode_Append(pleft, right);
10851 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010852}
10853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010854PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010855 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010857Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010858string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010859interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860
10861static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010862unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010864 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010865 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010866 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 int kind1, kind2, kind;
10869 void *buf1, *buf2;
10870 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871
Jesus Ceaac451502011-04-20 17:09:23 +020010872 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10873 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 kind1 = PyUnicode_KIND(self);
10877 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020010878 if (kind2 > kind1) {
10879 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010880 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020010881 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010882 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 buf1 = PyUnicode_DATA(self);
10884 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010885 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010886 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 if (!buf2) {
10888 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 return NULL;
10890 }
10891 len1 = PyUnicode_GET_LENGTH(self);
10892 len2 = PyUnicode_GET_LENGTH(substring);
10893
10894 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010895 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 case PyUnicode_1BYTE_KIND:
10897 iresult = ucs1lib_count(
10898 ((Py_UCS1*)buf1) + start, end - start,
10899 buf2, len2, PY_SSIZE_T_MAX
10900 );
10901 break;
10902 case PyUnicode_2BYTE_KIND:
10903 iresult = ucs2lib_count(
10904 ((Py_UCS2*)buf1) + start, end - start,
10905 buf2, len2, PY_SSIZE_T_MAX
10906 );
10907 break;
10908 case PyUnicode_4BYTE_KIND:
10909 iresult = ucs4lib_count(
10910 ((Py_UCS4*)buf1) + start, end - start,
10911 buf2, len2, PY_SSIZE_T_MAX
10912 );
10913 break;
10914 default:
10915 assert(0); iresult = 0;
10916 }
10917
10918 result = PyLong_FromSsize_t(iresult);
10919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920 if (kind2 != kind)
10921 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922
10923 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925 return result;
10926}
10927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010928PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010929 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010931Encode S using the codec registered for encoding. Default encoding\n\
10932is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010933handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010934a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10935'xmlcharrefreplace' as well as any other name registered with\n\
10936codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
10938static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010939unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010941 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942 char *encoding = NULL;
10943 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010944
Benjamin Peterson308d6372009-09-18 21:42:35 +000010945 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10946 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010948 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010949}
10950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010951PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010952 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953\n\
10954Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010955If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
10957static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010958unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010960 Py_ssize_t i, j, line_pos, src_len, incr;
10961 Py_UCS4 ch;
10962 PyObject *u;
10963 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010965 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010966 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
10968 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970
Antoine Pitrou22425222011-10-04 19:10:51 +020010971 if (PyUnicode_READY(self) == -1)
10972 return NULL;
10973
Thomas Wouters7e474022000-07-16 12:04:32 +000010974 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010975 src_len = PyUnicode_GET_LENGTH(self);
10976 i = j = line_pos = 0;
10977 kind = PyUnicode_KIND(self);
10978 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010979 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010980 for (; i < src_len; i++) {
10981 ch = PyUnicode_READ(kind, src_data, i);
10982 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010983 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010984 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010985 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010987 goto overflow;
10988 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010989 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010990 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010994 goto overflow;
10995 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010997 if (ch == '\n' || ch == '\r')
10998 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011000 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011001 if (!found)
11002 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011003
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011005 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 if (!u)
11007 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011008 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009
Antoine Pitroue71d5742011-10-04 15:55:09 +020011010 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011
Antoine Pitroue71d5742011-10-04 15:55:09 +020011012 for (; i < src_len; i++) {
11013 ch = PyUnicode_READ(kind, src_data, i);
11014 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011015 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011016 incr = tabsize - (line_pos % tabsize);
11017 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011018 FILL(kind, dest_data, ' ', j, incr);
11019 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011021 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011023 line_pos++;
11024 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011025 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011026 if (ch == '\n' || ch == '\r')
11027 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011029 }
11030 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011031 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011032
Antoine Pitroue71d5742011-10-04 15:55:09 +020011033 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011034 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036}
11037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011038PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011039 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040\n\
11041Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011042such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043arguments start and end are interpreted as in slice notation.\n\
11044\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011045Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046
11047static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011050 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011051 Py_ssize_t start;
11052 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011053 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054
Jesus Ceaac451502011-04-20 17:09:23 +020011055 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11056 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058
Christian Heimesd47802e2013-06-29 21:33:36 +020011059 if (PyUnicode_READY(self) == -1) {
11060 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011062 }
11063 if (PyUnicode_READY(substring) == -1) {
11064 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067
Victor Stinner7931d9a2011-11-04 00:22:48 +010011068 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
11070 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 if (result == -2)
11073 return NULL;
11074
Christian Heimes217cfd12007-12-02 14:31:20 +000011075 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076}
11077
11078static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011079unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011081 void *data;
11082 enum PyUnicode_Kind kind;
11083 Py_UCS4 ch;
11084 PyObject *res;
11085
11086 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11087 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011089 }
11090 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11091 PyErr_SetString(PyExc_IndexError, "string index out of range");
11092 return NULL;
11093 }
11094 kind = PyUnicode_KIND(self);
11095 data = PyUnicode_DATA(self);
11096 ch = PyUnicode_READ(kind, data, index);
11097 if (ch < 256)
11098 return get_latin1_char(ch);
11099
11100 res = PyUnicode_New(1, ch);
11101 if (res == NULL)
11102 return NULL;
11103 kind = PyUnicode_KIND(res);
11104 data = PyUnicode_DATA(res);
11105 PyUnicode_WRITE(kind, data, 0, ch);
11106 assert(_PyUnicode_CheckConsistency(res, 1));
11107 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108}
11109
Guido van Rossumc2504932007-09-18 19:42:40 +000011110/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011111 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011112static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011113unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114{
Guido van Rossumc2504932007-09-18 19:42:40 +000011115 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011116 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011117
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011118#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011119 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011120#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 if (_PyUnicode_HASH(self) != -1)
11122 return _PyUnicode_HASH(self);
11123 if (PyUnicode_READY(self) == -1)
11124 return -1;
11125 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011126 /*
11127 We make the hash of the empty string be 0, rather than using
11128 (prefix ^ suffix), since this slightly obfuscates the hash secret
11129 */
11130 if (len == 0) {
11131 _PyUnicode_HASH(self) = 0;
11132 return 0;
11133 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134
11135 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011136#define HASH(P) \
11137 x ^= (Py_uhash_t) *P << 7; \
11138 while (--len >= 0) \
11139 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140
Georg Brandl2fb477c2012-02-21 00:33:36 +010011141 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 switch (PyUnicode_KIND(self)) {
11143 case PyUnicode_1BYTE_KIND: {
11144 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11145 HASH(c);
11146 break;
11147 }
11148 case PyUnicode_2BYTE_KIND: {
11149 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11150 HASH(s);
11151 break;
11152 }
11153 default: {
11154 Py_UCS4 *l;
11155 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11156 "Impossible switch case in unicode_hash");
11157 l = PyUnicode_4BYTE_DATA(self);
11158 HASH(l);
11159 break;
11160 }
11161 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011162 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11163 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164
Guido van Rossumc2504932007-09-18 19:42:40 +000011165 if (x == -1)
11166 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011168 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011172PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011173 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011175Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176
11177static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011180 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011181 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011182 Py_ssize_t start;
11183 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
Jesus Ceaac451502011-04-20 17:09:23 +020011185 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11186 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
Christian Heimesd47a0452013-06-29 21:21:37 +020011189 if (PyUnicode_READY(self) == -1) {
11190 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011192 }
11193 if (PyUnicode_READY(substring) == -1) {
11194 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197
Victor Stinner7931d9a2011-11-04 00:22:48 +010011198 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
11200 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 if (result == -2)
11203 return NULL;
11204
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 if (result < 0) {
11206 PyErr_SetString(PyExc_ValueError, "substring not found");
11207 return NULL;
11208 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011209
Christian Heimes217cfd12007-12-02 14:31:20 +000011210 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211}
11212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011213PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011216Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011217at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218
11219static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011220unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 Py_ssize_t i, length;
11223 int kind;
11224 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225 int cased;
11226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 if (PyUnicode_READY(self) == -1)
11228 return NULL;
11229 length = PyUnicode_GET_LENGTH(self);
11230 kind = PyUnicode_KIND(self);
11231 data = PyUnicode_DATA(self);
11232
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 if (length == 1)
11235 return PyBool_FromLong(
11236 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011238 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011241
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011243 for (i = 0; i < length; i++) {
11244 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011245
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11247 return PyBool_FromLong(0);
11248 else if (!cased && Py_UNICODE_ISLOWER(ch))
11249 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011251 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252}
11253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011254PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011257Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011258at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
11260static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011261unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 Py_ssize_t i, length;
11264 int kind;
11265 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 int cased;
11267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 if (PyUnicode_READY(self) == -1)
11269 return NULL;
11270 length = PyUnicode_GET_LENGTH(self);
11271 kind = PyUnicode_KIND(self);
11272 data = PyUnicode_DATA(self);
11273
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 if (length == 1)
11276 return PyBool_FromLong(
11277 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011279 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011282
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 for (i = 0; i < length; i++) {
11285 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011286
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11288 return PyBool_FromLong(0);
11289 else if (!cased && Py_UNICODE_ISUPPER(ch))
11290 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011292 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293}
11294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011295PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011298Return True if S is a titlecased string and there is at least one\n\
11299character in S, i.e. upper- and titlecase characters may only\n\
11300follow uncased characters and lowercase characters only cased ones.\n\
11301Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
11303static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011304unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 Py_ssize_t i, length;
11307 int kind;
11308 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 int cased, previous_is_cased;
11310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 if (PyUnicode_READY(self) == -1)
11312 return NULL;
11313 length = PyUnicode_GET_LENGTH(self);
11314 kind = PyUnicode_KIND(self);
11315 data = PyUnicode_DATA(self);
11316
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 if (length == 1) {
11319 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11320 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11321 (Py_UNICODE_ISUPPER(ch) != 0));
11322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011324 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011327
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328 cased = 0;
11329 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 for (i = 0; i < length; i++) {
11331 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011332
Benjamin Peterson29060642009-01-31 22:14:21 +000011333 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11334 if (previous_is_cased)
11335 return PyBool_FromLong(0);
11336 previous_is_cased = 1;
11337 cased = 1;
11338 }
11339 else if (Py_UNICODE_ISLOWER(ch)) {
11340 if (!previous_is_cased)
11341 return PyBool_FromLong(0);
11342 previous_is_cased = 1;
11343 cased = 1;
11344 }
11345 else
11346 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011348 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349}
11350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011351PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011352 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011354Return True if all characters in S are whitespace\n\
11355and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356
11357static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011358unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 Py_ssize_t i, length;
11361 int kind;
11362 void *data;
11363
11364 if (PyUnicode_READY(self) == -1)
11365 return NULL;
11366 length = PyUnicode_GET_LENGTH(self);
11367 kind = PyUnicode_KIND(self);
11368 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 if (length == 1)
11372 return PyBool_FromLong(
11373 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011375 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011377 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 for (i = 0; i < length; i++) {
11380 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011381 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011382 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011384 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385}
11386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011387PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011390Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011391and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011392
11393static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011394unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011395{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 Py_ssize_t i, length;
11397 int kind;
11398 void *data;
11399
11400 if (PyUnicode_READY(self) == -1)
11401 return NULL;
11402 length = PyUnicode_GET_LENGTH(self);
11403 kind = PyUnicode_KIND(self);
11404 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011405
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011406 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 if (length == 1)
11408 return PyBool_FromLong(
11409 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011410
11411 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011413 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 for (i = 0; i < length; i++) {
11416 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011419 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011420}
11421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011422PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011424\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011425Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011427
11428static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011429unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 int kind;
11432 void *data;
11433 Py_ssize_t len, i;
11434
11435 if (PyUnicode_READY(self) == -1)
11436 return NULL;
11437
11438 kind = PyUnicode_KIND(self);
11439 data = PyUnicode_DATA(self);
11440 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011441
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011442 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 if (len == 1) {
11444 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11445 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11446 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011447
11448 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 for (i = 0; i < len; i++) {
11453 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011454 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011456 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011457 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011458}
11459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011463Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011464False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465
11466static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011467unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 Py_ssize_t i, length;
11470 int kind;
11471 void *data;
11472
11473 if (PyUnicode_READY(self) == -1)
11474 return NULL;
11475 length = PyUnicode_GET_LENGTH(self);
11476 kind = PyUnicode_KIND(self);
11477 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (length == 1)
11481 return PyBool_FromLong(
11482 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011484 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 for (i = 0; i < length; i++) {
11489 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011492 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493}
11494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011495PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011496 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011498Return True if all characters in S are digits\n\
11499and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
11501static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011502unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 Py_ssize_t i, length;
11505 int kind;
11506 void *data;
11507
11508 if (PyUnicode_READY(self) == -1)
11509 return NULL;
11510 length = PyUnicode_GET_LENGTH(self);
11511 kind = PyUnicode_KIND(self);
11512 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 if (length == 1) {
11516 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11517 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011520 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 for (i = 0; i < length; i++) {
11525 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011528 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529}
11530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011531PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011534Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011535False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
11537static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011538unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 Py_ssize_t i, length;
11541 int kind;
11542 void *data;
11543
11544 if (PyUnicode_READY(self) == -1)
11545 return NULL;
11546 length = PyUnicode_GET_LENGTH(self);
11547 kind = PyUnicode_KIND(self);
11548 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (length == 1)
11552 return PyBool_FromLong(
11553 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011555 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 for (i = 0; i < length; i++) {
11560 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011563 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564}
11565
Martin v. Löwis47383402007-08-15 07:32:56 +000011566int
11567PyUnicode_IsIdentifier(PyObject *self)
11568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 int kind;
11570 void *data;
11571 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011572 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 if (PyUnicode_READY(self) == -1) {
11575 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 }
11578
11579 /* Special case for empty strings */
11580 if (PyUnicode_GET_LENGTH(self) == 0)
11581 return 0;
11582 kind = PyUnicode_KIND(self);
11583 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011584
11585 /* PEP 3131 says that the first character must be in
11586 XID_Start and subsequent characters in XID_Continue,
11587 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011588 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011589 letters, digits, underscore). However, given the current
11590 definition of XID_Start and XID_Continue, it is sufficient
11591 to check just for these, except that _ must be allowed
11592 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011594 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011595 return 0;
11596
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011597 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011600 return 1;
11601}
11602
11603PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011605\n\
11606Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011607to the language definition.\n\
11608\n\
11609Use keyword.iskeyword() to test for reserved identifiers\n\
11610such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011611
11612static PyObject*
11613unicode_isidentifier(PyObject *self)
11614{
11615 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11616}
11617
Georg Brandl559e5d72008-06-11 18:37:52 +000011618PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011620\n\
11621Return True if all characters in S are considered\n\
11622printable in repr() or S is empty, False otherwise.");
11623
11624static PyObject*
11625unicode_isprintable(PyObject *self)
11626{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 Py_ssize_t i, length;
11628 int kind;
11629 void *data;
11630
11631 if (PyUnicode_READY(self) == -1)
11632 return NULL;
11633 length = PyUnicode_GET_LENGTH(self);
11634 kind = PyUnicode_KIND(self);
11635 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011636
11637 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 if (length == 1)
11639 return PyBool_FromLong(
11640 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 for (i = 0; i < length; i++) {
11643 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011644 Py_RETURN_FALSE;
11645 }
11646 }
11647 Py_RETURN_TRUE;
11648}
11649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011650PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011651 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652\n\
11653Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011654iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655
11656static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011657unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011659 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660}
11661
Martin v. Löwis18e16552006-02-15 17:27:45 +000011662static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011663unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 if (PyUnicode_READY(self) == -1)
11666 return -1;
11667 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668}
11669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011670PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011673Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011674done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675
11676static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011677unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011679 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 Py_UCS4 fillchar = ' ';
11681
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011682 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683 return NULL;
11684
Benjamin Petersonbac79492012-01-14 13:34:47 -050011685 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011686 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687
Victor Stinnerc4b49542011-12-11 22:44:26 +010011688 if (PyUnicode_GET_LENGTH(self) >= width)
11689 return unicode_result_unchanged(self);
11690
11691 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692}
11693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011694PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011695 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011697Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698
11699static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011700unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011702 if (PyUnicode_READY(self) == -1)
11703 return NULL;
11704 if (PyUnicode_IS_ASCII(self))
11705 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011706 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707}
11708
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Format strings, indexed by the strip mode constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
11717
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011718/* externally visible for str.strip(unicode) */
11719PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011720_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 void *data;
11723 int kind;
11724 Py_ssize_t i, j, len;
11725 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011726 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11729 return NULL;
11730
11731 kind = PyUnicode_KIND(self);
11732 data = PyUnicode_DATA(self);
11733 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011734 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11736 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011737 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011738
Benjamin Peterson14339b62009-01-31 16:36:08 +000011739 i = 0;
11740 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011741 while (i < len) {
11742 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11743 if (!BLOOM(sepmask, ch))
11744 break;
11745 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11746 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 i++;
11748 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011749 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750
Benjamin Peterson14339b62009-01-31 16:36:08 +000011751 j = len;
11752 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011753 j--;
11754 while (j >= i) {
11755 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11756 if (!BLOOM(sepmask, ch))
11757 break;
11758 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11759 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011761 }
11762
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011764 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011765
Victor Stinner7931d9a2011-11-04 00:22:48 +010011766 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767}
11768
11769PyObject*
11770PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11771{
11772 unsigned char *data;
11773 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011774 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775
Victor Stinnerde636f32011-10-01 03:55:54 +020011776 if (PyUnicode_READY(self) == -1)
11777 return NULL;
11778
Victor Stinner684d5fd2012-05-03 02:32:34 +020011779 length = PyUnicode_GET_LENGTH(self);
11780 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011781
Victor Stinner684d5fd2012-05-03 02:32:34 +020011782 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011783 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784
Victor Stinnerde636f32011-10-01 03:55:54 +020011785 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011786 PyErr_SetString(PyExc_IndexError, "string index out of range");
11787 return NULL;
11788 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011789 if (start >= length || end < start)
11790 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011791
Victor Stinner684d5fd2012-05-03 02:32:34 +020011792 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011793 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011794 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011795 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011796 }
11797 else {
11798 kind = PyUnicode_KIND(self);
11799 data = PyUnicode_1BYTE_DATA(self);
11800 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011801 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011802 length);
11803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
11806static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011807do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 Py_ssize_t len, i, j;
11810
11811 if (PyUnicode_READY(self) == -1)
11812 return NULL;
11813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815
Victor Stinnercc7af722013-04-09 22:39:24 +020011816 if (PyUnicode_IS_ASCII(self)) {
11817 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11818
11819 i = 0;
11820 if (striptype != RIGHTSTRIP) {
11821 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011822 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020011823 if (!_Py_ascii_whitespace[ch])
11824 break;
11825 i++;
11826 }
11827 }
11828
11829 j = len;
11830 if (striptype != LEFTSTRIP) {
11831 j--;
11832 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011833 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020011834 if (!_Py_ascii_whitespace[ch])
11835 break;
11836 j--;
11837 }
11838 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011839 }
11840 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011841 else {
11842 int kind = PyUnicode_KIND(self);
11843 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011844
Victor Stinnercc7af722013-04-09 22:39:24 +020011845 i = 0;
11846 if (striptype != RIGHTSTRIP) {
11847 while (i < len) {
11848 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11849 if (!Py_UNICODE_ISSPACE(ch))
11850 break;
11851 i++;
11852 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011853 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011854
11855 j = len;
11856 if (striptype != LEFTSTRIP) {
11857 j--;
11858 while (j >= i) {
11859 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11860 if (!Py_UNICODE_ISSPACE(ch))
11861 break;
11862 j--;
11863 }
11864 j++;
11865 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011866 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011867
Victor Stinner7931d9a2011-11-04 00:22:48 +010011868 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869}
11870
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011871
11872static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011873do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011874{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011875 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011876
Serhiy Storchakac6792272013-10-19 21:03:34 +030011877 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011878 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011879
Benjamin Peterson14339b62009-01-31 16:36:08 +000011880 if (sep != NULL && sep != Py_None) {
11881 if (PyUnicode_Check(sep))
11882 return _PyUnicode_XStrip(self, striptype, sep);
11883 else {
11884 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011885 "%s arg must be None or str",
11886 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011887 return NULL;
11888 }
11889 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011890
Benjamin Peterson14339b62009-01-31 16:36:08 +000011891 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011892}
11893
11894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011895PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011897\n\
11898Return a copy of the string S with leading and trailing\n\
11899whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011900If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011901
11902static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011903unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011904{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011905 if (PyTuple_GET_SIZE(args) == 0)
11906 return do_strip(self, BOTHSTRIP); /* Common case */
11907 else
11908 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011909}
11910
11911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011912PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011914\n\
11915Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011916If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011917
11918static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011919unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011920{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011921 if (PyTuple_GET_SIZE(args) == 0)
11922 return do_strip(self, LEFTSTRIP); /* Common case */
11923 else
11924 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011925}
11926
11927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011928PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011930\n\
11931Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011932If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011933
11934static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011935unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011936{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011937 if (PyTuple_GET_SIZE(args) == 0)
11938 return do_strip(self, RIGHTSTRIP); /* Common case */
11939 else
11940 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011941}
11942
11943
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011945unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011947 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949
Serhiy Storchaka05997252013-01-26 12:14:02 +020011950 if (len < 1)
11951 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
Victor Stinnerc4b49542011-12-11 22:44:26 +010011953 /* no repeat, return original string */
11954 if (len == 1)
11955 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011956
Benjamin Petersonbac79492012-01-14 13:34:47 -050011957 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 return NULL;
11959
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011960 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011961 PyErr_SetString(PyExc_OverflowError,
11962 "repeated string is too long");
11963 return NULL;
11964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011966
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011967 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 if (!u)
11969 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011970 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (PyUnicode_GET_LENGTH(str) == 1) {
11973 const int kind = PyUnicode_KIND(str);
11974 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011975 if (kind == PyUnicode_1BYTE_KIND) {
11976 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011977 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011978 }
11979 else if (kind == PyUnicode_2BYTE_KIND) {
11980 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011981 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011982 ucs2[n] = fill_char;
11983 } else {
11984 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11985 assert(kind == PyUnicode_4BYTE_KIND);
11986 for (n = 0; n < len; ++n)
11987 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 }
11990 else {
11991 /* number of characters copied this far */
11992 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011993 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 char *to = (char *) PyUnicode_DATA(u);
11995 Py_MEMCPY(to, PyUnicode_DATA(str),
11996 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 n = (done <= nchars-done) ? done : nchars-done;
11999 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012000 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 }
12003
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012004 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012005 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006}
12007
Alexander Belopolsky40018472011-02-26 01:02:56 +000012008PyObject *
12009PyUnicode_Replace(PyObject *obj,
12010 PyObject *subobj,
12011 PyObject *replobj,
12012 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013{
12014 PyObject *self;
12015 PyObject *str1;
12016 PyObject *str2;
12017 PyObject *result;
12018
12019 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012020 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012023 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 Py_DECREF(self);
12025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026 }
12027 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012028 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012029 Py_DECREF(self);
12030 Py_DECREF(str1);
12031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012033 if (PyUnicode_READY(self) == -1 ||
12034 PyUnicode_READY(str1) == -1 ||
12035 PyUnicode_READY(str2) == -1)
12036 result = NULL;
12037 else
12038 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039 Py_DECREF(self);
12040 Py_DECREF(str1);
12041 Py_DECREF(str2);
12042 return result;
12043}
12044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012045PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012046 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047\n\
12048Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012049old replaced by new. If the optional argument count is\n\
12050given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
12052static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 PyObject *str1;
12056 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012057 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 PyObject *result;
12059
Martin v. Löwis18e16552006-02-15 17:27:45 +000012060 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012062 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012063 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012065 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 return NULL;
12067 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012068 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 Py_DECREF(str1);
12070 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012071 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012072 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12073 result = NULL;
12074 else
12075 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076
12077 Py_DECREF(str1);
12078 Py_DECREF(str2);
12079 return result;
12080}
12081
Alexander Belopolsky40018472011-02-26 01:02:56 +000012082static PyObject *
12083unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012085 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 Py_ssize_t isize;
12087 Py_ssize_t osize, squote, dquote, i, o;
12088 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012089 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012093 return NULL;
12094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 isize = PyUnicode_GET_LENGTH(unicode);
12096 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 /* Compute length of output, quote characters, and
12099 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012100 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 max = 127;
12102 squote = dquote = 0;
12103 ikind = PyUnicode_KIND(unicode);
12104 for (i = 0; i < isize; i++) {
12105 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12106 switch (ch) {
12107 case '\'': squote++; osize++; break;
12108 case '"': dquote++; osize++; break;
12109 case '\\': case '\t': case '\r': case '\n':
12110 osize += 2; break;
12111 default:
12112 /* Fast-path ASCII */
12113 if (ch < ' ' || ch == 0x7f)
12114 osize += 4; /* \xHH */
12115 else if (ch < 0x7f)
12116 osize++;
12117 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12118 osize++;
12119 max = ch > max ? ch : max;
12120 }
12121 else if (ch < 0x100)
12122 osize += 4; /* \xHH */
12123 else if (ch < 0x10000)
12124 osize += 6; /* \uHHHH */
12125 else
12126 osize += 10; /* \uHHHHHHHH */
12127 }
12128 }
12129
12130 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012131 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012133 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (dquote)
12135 /* Both squote and dquote present. Use squote,
12136 and escape them */
12137 osize += squote;
12138 else
12139 quote = '"';
12140 }
Victor Stinner55c08782013-04-14 18:45:39 +020012141 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142
12143 repr = PyUnicode_New(osize, max);
12144 if (repr == NULL)
12145 return NULL;
12146 okind = PyUnicode_KIND(repr);
12147 odata = PyUnicode_DATA(repr);
12148
12149 PyUnicode_WRITE(okind, odata, 0, quote);
12150 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012151 if (unchanged) {
12152 _PyUnicode_FastCopyCharacters(repr, 1,
12153 unicode, 0,
12154 isize);
12155 }
12156 else {
12157 for (i = 0, o = 1; i < isize; i++) {
12158 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159
Victor Stinner55c08782013-04-14 18:45:39 +020012160 /* Escape quotes and backslashes */
12161 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012162 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012164 continue;
12165 }
12166
12167 /* Map special whitespace to '\t', \n', '\r' */
12168 if (ch == '\t') {
12169 PyUnicode_WRITE(okind, odata, o++, '\\');
12170 PyUnicode_WRITE(okind, odata, o++, 't');
12171 }
12172 else if (ch == '\n') {
12173 PyUnicode_WRITE(okind, odata, o++, '\\');
12174 PyUnicode_WRITE(okind, odata, o++, 'n');
12175 }
12176 else if (ch == '\r') {
12177 PyUnicode_WRITE(okind, odata, o++, '\\');
12178 PyUnicode_WRITE(okind, odata, o++, 'r');
12179 }
12180
12181 /* Map non-printable US ASCII to '\xhh' */
12182 else if (ch < ' ' || ch == 0x7F) {
12183 PyUnicode_WRITE(okind, odata, o++, '\\');
12184 PyUnicode_WRITE(okind, odata, o++, 'x');
12185 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12186 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12187 }
12188
12189 /* Copy ASCII characters as-is */
12190 else if (ch < 0x7F) {
12191 PyUnicode_WRITE(okind, odata, o++, ch);
12192 }
12193
12194 /* Non-ASCII characters */
12195 else {
12196 /* Map Unicode whitespace and control characters
12197 (categories Z* and C* except ASCII space)
12198 */
12199 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12200 PyUnicode_WRITE(okind, odata, o++, '\\');
12201 /* Map 8-bit characters to '\xhh' */
12202 if (ch <= 0xff) {
12203 PyUnicode_WRITE(okind, odata, o++, 'x');
12204 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12205 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12206 }
12207 /* Map 16-bit characters to '\uxxxx' */
12208 else if (ch <= 0xffff) {
12209 PyUnicode_WRITE(okind, odata, o++, 'u');
12210 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12211 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12212 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12213 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12214 }
12215 /* Map 21-bit characters to '\U00xxxxxx' */
12216 else {
12217 PyUnicode_WRITE(okind, odata, o++, 'U');
12218 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12219 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12220 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12221 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12222 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12223 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12224 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12225 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12226 }
12227 }
12228 /* Copy characters as-is */
12229 else {
12230 PyUnicode_WRITE(okind, odata, o++, ch);
12231 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012232 }
12233 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012236 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012237 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238}
12239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012240PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012241 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242\n\
12243Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012244such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245arguments start and end are interpreted as in slice notation.\n\
12246\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012247Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248
12249static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012252 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012253 Py_ssize_t start;
12254 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256
Jesus Ceaac451502011-04-20 17:09:23 +020012257 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12258 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260
Christian Heimesea71a522013-06-29 21:17:34 +020012261 if (PyUnicode_READY(self) == -1) {
12262 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012264 }
12265 if (PyUnicode_READY(substring) == -1) {
12266 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012268 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269
Victor Stinner7931d9a2011-11-04 00:22:48 +010012270 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271
12272 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 if (result == -2)
12275 return NULL;
12276
Christian Heimes217cfd12007-12-02 14:31:20 +000012277 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278}
12279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012280PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012283Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284
12285static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012288 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012289 Py_ssize_t start;
12290 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292
Jesus Ceaac451502011-04-20 17:09:23 +020012293 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12294 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
Christian Heimesea71a522013-06-29 21:17:34 +020012297 if (PyUnicode_READY(self) == -1) {
12298 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012300 }
12301 if (PyUnicode_READY(substring) == -1) {
12302 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305
Victor Stinner7931d9a2011-11-04 00:22:48 +010012306 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
12308 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (result == -2)
12311 return NULL;
12312
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313 if (result < 0) {
12314 PyErr_SetString(PyExc_ValueError, "substring not found");
12315 return NULL;
12316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317
Christian Heimes217cfd12007-12-02 14:31:20 +000012318 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319}
12320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012321PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012324Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012325done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326
12327static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012328unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012330 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 Py_UCS4 fillchar = ' ';
12332
Victor Stinnere9a29352011-10-01 02:14:59 +020012333 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012335
Benjamin Petersonbac79492012-01-14 13:34:47 -050012336 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337 return NULL;
12338
Victor Stinnerc4b49542011-12-11 22:44:26 +010012339 if (PyUnicode_GET_LENGTH(self) >= width)
12340 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341
Victor Stinnerc4b49542011-12-11 22:44:26 +010012342 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343}
12344
Alexander Belopolsky40018472011-02-26 01:02:56 +000012345PyObject *
12346PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347{
12348 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012349
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350 s = PyUnicode_FromObject(s);
12351 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012352 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 if (sep != NULL) {
12354 sep = PyUnicode_FromObject(sep);
12355 if (sep == NULL) {
12356 Py_DECREF(s);
12357 return NULL;
12358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359 }
12360
Victor Stinner9310abb2011-10-05 00:59:23 +020012361 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
12363 Py_DECREF(s);
12364 Py_XDECREF(sep);
12365 return result;
12366}
12367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012368PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012369 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370\n\
12371Return a list of the words in S, using sep as the\n\
12372delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012373splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012374whitespace string is a separator and empty strings are\n\
12375removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376
12377static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012378unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012380 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012382 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012384 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12385 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 return NULL;
12387
12388 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012391 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012393 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394}
12395
Thomas Wouters477c8d52006-05-27 19:21:47 +000012396PyObject *
12397PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12398{
12399 PyObject* str_obj;
12400 PyObject* sep_obj;
12401 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 int kind1, kind2, kind;
12403 void *buf1 = NULL, *buf2 = NULL;
12404 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012405
12406 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012407 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012408 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012409 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012410 if (!sep_obj) {
12411 Py_DECREF(str_obj);
12412 return NULL;
12413 }
12414 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12415 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012416 Py_DECREF(str_obj);
12417 return NULL;
12418 }
12419
Victor Stinner14f8f022011-10-05 20:58:25 +020012420 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012422 kind = Py_MAX(kind1, kind2);
12423 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012425 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 if (!buf1)
12427 goto onError;
12428 buf2 = PyUnicode_DATA(sep_obj);
12429 if (kind2 != kind)
12430 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12431 if (!buf2)
12432 goto onError;
12433 len1 = PyUnicode_GET_LENGTH(str_obj);
12434 len2 = PyUnicode_GET_LENGTH(sep_obj);
12435
Benjamin Petersonead6b532011-12-20 17:23:42 -060012436 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012438 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12439 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12440 else
12441 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 break;
12443 case PyUnicode_2BYTE_KIND:
12444 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12445 break;
12446 case PyUnicode_4BYTE_KIND:
12447 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12448 break;
12449 default:
12450 assert(0);
12451 out = 0;
12452 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453
12454 Py_DECREF(sep_obj);
12455 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 if (kind1 != kind)
12457 PyMem_Free(buf1);
12458 if (kind2 != kind)
12459 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012460
12461 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 onError:
12463 Py_DECREF(sep_obj);
12464 Py_DECREF(str_obj);
12465 if (kind1 != kind && buf1)
12466 PyMem_Free(buf1);
12467 if (kind2 != kind && buf2)
12468 PyMem_Free(buf2);
12469 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012470}
12471
12472
12473PyObject *
12474PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12475{
12476 PyObject* str_obj;
12477 PyObject* sep_obj;
12478 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 int kind1, kind2, kind;
12480 void *buf1 = NULL, *buf2 = NULL;
12481 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012482
12483 str_obj = PyUnicode_FromObject(str_in);
12484 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012485 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012486 sep_obj = PyUnicode_FromObject(sep_in);
12487 if (!sep_obj) {
12488 Py_DECREF(str_obj);
12489 return NULL;
12490 }
12491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 kind1 = PyUnicode_KIND(str_in);
12493 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012494 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 buf1 = PyUnicode_DATA(str_in);
12496 if (kind1 != kind)
12497 buf1 = _PyUnicode_AsKind(str_in, kind);
12498 if (!buf1)
12499 goto onError;
12500 buf2 = PyUnicode_DATA(sep_obj);
12501 if (kind2 != kind)
12502 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12503 if (!buf2)
12504 goto onError;
12505 len1 = PyUnicode_GET_LENGTH(str_obj);
12506 len2 = PyUnicode_GET_LENGTH(sep_obj);
12507
Benjamin Petersonead6b532011-12-20 17:23:42 -060012508 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012510 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12511 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12512 else
12513 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 break;
12515 case PyUnicode_2BYTE_KIND:
12516 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12517 break;
12518 case PyUnicode_4BYTE_KIND:
12519 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12520 break;
12521 default:
12522 assert(0);
12523 out = 0;
12524 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012525
12526 Py_DECREF(sep_obj);
12527 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 if (kind1 != kind)
12529 PyMem_Free(buf1);
12530 if (kind2 != kind)
12531 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012532
12533 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 onError:
12535 Py_DECREF(sep_obj);
12536 Py_DECREF(str_obj);
12537 if (kind1 != kind && buf1)
12538 PyMem_Free(buf1);
12539 if (kind2 != kind && buf2)
12540 PyMem_Free(buf2);
12541 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012542}
12543
12544PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012545 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012546\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012547Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012548the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012549found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012550
12551static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012552unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012553{
Victor Stinner9310abb2011-10-05 00:59:23 +020012554 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012555}
12556
12557PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012558 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012559\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012560Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012561the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012562separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012563
12564static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012565unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012566{
Victor Stinner9310abb2011-10-05 00:59:23 +020012567 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012568}
12569
Alexander Belopolsky40018472011-02-26 01:02:56 +000012570PyObject *
12571PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012572{
12573 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012574
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012575 s = PyUnicode_FromObject(s);
12576 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012577 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 if (sep != NULL) {
12579 sep = PyUnicode_FromObject(sep);
12580 if (sep == NULL) {
12581 Py_DECREF(s);
12582 return NULL;
12583 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012584 }
12585
Victor Stinner9310abb2011-10-05 00:59:23 +020012586 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012587
12588 Py_DECREF(s);
12589 Py_XDECREF(sep);
12590 return result;
12591}
12592
12593PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012594 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012595\n\
12596Return a list of the words in S, using sep as the\n\
12597delimiter string, starting at the end of the string and\n\
12598working to the front. If maxsplit is given, at most maxsplit\n\
12599splits are done. If sep is not specified, any whitespace string\n\
12600is a separator.");
12601
12602static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012603unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012604{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012605 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012606 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012607 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012608
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012609 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12610 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012611 return NULL;
12612
12613 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012615 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012616 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012617 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012618 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012619}
12620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012621PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623\n\
12624Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012625Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012626is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
12628static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012629unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012631 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012632 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012634 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12635 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 return NULL;
12637
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012638 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639}
12640
12641static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012642PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012644 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645}
12646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012647PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012648 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649\n\
12650Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012651and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652
12653static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012654unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012656 if (PyUnicode_READY(self) == -1)
12657 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012658 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659}
12660
Larry Hastings31826802013-10-19 00:09:25 -070012661/*[clinic]
12662module str
Georg Brandlceee0772007-11-27 23:48:05 +000012663
Larry Hastings31826802013-10-19 00:09:25 -070012664@staticmethod
12665str.maketrans as unicode_maketrans
12666
12667 x: object
12668
12669 y: unicode=NULL
12670
12671 z: unicode=NULL
12672
12673 /
12674
12675Return a translation table usable for str.translate().
12676
12677If there is only one argument, it must be a dictionary mapping Unicode
12678ordinals (integers) or characters to Unicode ordinals, strings or None.
12679Character keys will be then converted to ordinals.
12680If there are two arguments, they must be strings of equal length, and
12681in the resulting dictionary, each character in x will be mapped to the
12682character at the same position in y. If there is a third argument, it
12683must be a string, whose characters will be mapped to None in the result.
12684[clinic]*/
12685
12686PyDoc_STRVAR(unicode_maketrans__doc__,
12687"Return a translation table usable for str.translate().\n"
12688"\n"
12689"str.maketrans(x, y=None, z=None)\n"
12690"\n"
12691"If there is only one argument, it must be a dictionary mapping Unicode\n"
12692"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12693"Character keys will be then converted to ordinals.\n"
12694"If there are two arguments, they must be strings of equal length, and\n"
12695"in the resulting dictionary, each character in x will be mapped to the\n"
12696"character at the same position in y. If there is a third argument, it\n"
12697"must be a string, whose characters will be mapped to None in the result.");
12698
12699#define UNICODE_MAKETRANS_METHODDEF \
12700 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12701
12702static PyObject *
12703unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
12704
12705static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012706unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012707{
Larry Hastings31826802013-10-19 00:09:25 -070012708 PyObject *return_value = NULL;
12709 PyObject *x;
12710 PyObject *y = NULL;
12711 PyObject *z = NULL;
12712
12713 if (!PyArg_ParseTuple(args,
12714 "O|UU:maketrans",
12715 &x, &y, &z))
12716 goto exit;
12717 return_value = unicode_maketrans_impl(x, y, z);
12718
12719exit:
12720 return return_value;
12721}
12722
12723static PyObject *
12724unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12725/*[clinic checksum: 137db9c3199e7906b7967009f511c24fa3235b5f]*/
12726{
Georg Brandlceee0772007-11-27 23:48:05 +000012727 PyObject *new = NULL, *key, *value;
12728 Py_ssize_t i = 0;
12729 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012730
Georg Brandlceee0772007-11-27 23:48:05 +000012731 new = PyDict_New();
12732 if (!new)
12733 return NULL;
12734 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 int x_kind, y_kind, z_kind;
12736 void *x_data, *y_data, *z_data;
12737
Georg Brandlceee0772007-11-27 23:48:05 +000012738 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012739 if (!PyUnicode_Check(x)) {
12740 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12741 "be a string if there is a second argument");
12742 goto err;
12743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012745 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12746 "arguments must have equal length");
12747 goto err;
12748 }
12749 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 x_kind = PyUnicode_KIND(x);
12751 y_kind = PyUnicode_KIND(y);
12752 x_data = PyUnicode_DATA(x);
12753 y_data = PyUnicode_DATA(y);
12754 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12755 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012756 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012757 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012758 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012759 if (!value) {
12760 Py_DECREF(key);
12761 goto err;
12762 }
Georg Brandlceee0772007-11-27 23:48:05 +000012763 res = PyDict_SetItem(new, key, value);
12764 Py_DECREF(key);
12765 Py_DECREF(value);
12766 if (res < 0)
12767 goto err;
12768 }
12769 /* create entries for deleting chars in z */
12770 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771 z_kind = PyUnicode_KIND(z);
12772 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012773 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012775 if (!key)
12776 goto err;
12777 res = PyDict_SetItem(new, key, Py_None);
12778 Py_DECREF(key);
12779 if (res < 0)
12780 goto err;
12781 }
12782 }
12783 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 int kind;
12785 void *data;
12786
Georg Brandlceee0772007-11-27 23:48:05 +000012787 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012788 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012789 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12790 "to maketrans it must be a dict");
12791 goto err;
12792 }
12793 /* copy entries into the new dict, converting string keys to int keys */
12794 while (PyDict_Next(x, &i, &key, &value)) {
12795 if (PyUnicode_Check(key)) {
12796 /* convert string keys to integer keys */
12797 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012798 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012799 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12800 "table must be of length 1");
12801 goto err;
12802 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 kind = PyUnicode_KIND(key);
12804 data = PyUnicode_DATA(key);
12805 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012806 if (!newkey)
12807 goto err;
12808 res = PyDict_SetItem(new, newkey, value);
12809 Py_DECREF(newkey);
12810 if (res < 0)
12811 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012812 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012813 /* just keep integer keys */
12814 if (PyDict_SetItem(new, key, value) < 0)
12815 goto err;
12816 } else {
12817 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12818 "be strings or integers");
12819 goto err;
12820 }
12821 }
12822 }
12823 return new;
12824 err:
12825 Py_DECREF(new);
12826 return NULL;
12827}
12828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012829PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831\n\
12832Return a copy of the string S, where all characters have been mapped\n\
12833through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012834Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012835Unmapped characters are left untouched. Characters mapped to None\n\
12836are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837
12838static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842}
12843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012844PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012847Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848
12849static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012850unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012852 if (PyUnicode_READY(self) == -1)
12853 return NULL;
12854 if (PyUnicode_IS_ASCII(self))
12855 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012856 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857}
12858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012859PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012860 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012862Pad a numeric string S with zeros on the left, to fill a field\n\
12863of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864
12865static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012866unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012868 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012869 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012870 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 int kind;
12872 void *data;
12873 Py_UCS4 chr;
12874
Martin v. Löwis18e16552006-02-15 17:27:45 +000012875 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876 return NULL;
12877
Benjamin Petersonbac79492012-01-14 13:34:47 -050012878 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880
Victor Stinnerc4b49542011-12-11 22:44:26 +010012881 if (PyUnicode_GET_LENGTH(self) >= width)
12882 return unicode_result_unchanged(self);
12883
12884 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885
12886 u = pad(self, fill, 0, '0');
12887
Walter Dörwald068325e2002-04-15 13:36:47 +000012888 if (u == NULL)
12889 return NULL;
12890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 kind = PyUnicode_KIND(u);
12892 data = PyUnicode_DATA(u);
12893 chr = PyUnicode_READ(kind, data, fill);
12894
12895 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 PyUnicode_WRITE(kind, data, 0, chr);
12898 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899 }
12900
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012901 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012902 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904
#if 0
/* Compiled out by the surrounding #if 0.  Would return the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012913PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012914 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012916Return True if S starts with the specified prefix, False otherwise.\n\
12917With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012918With optional end, stop comparing S at that position.\n\
12919prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012920
12921static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012922unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012925 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012926 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012927 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012928 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012929 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930
Jesus Ceaac451502011-04-20 17:09:23 +020012931 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012933 if (PyTuple_Check(subobj)) {
12934 Py_ssize_t i;
12935 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012936 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012937 if (substring == NULL)
12938 return NULL;
12939 result = tailmatch(self, substring, start, end, -1);
12940 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012941 if (result == -1)
12942 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012943 if (result) {
12944 Py_RETURN_TRUE;
12945 }
12946 }
12947 /* nothing matched */
12948 Py_RETURN_FALSE;
12949 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012950 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012951 if (substring == NULL) {
12952 if (PyErr_ExceptionMatches(PyExc_TypeError))
12953 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12954 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012955 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012956 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012957 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012959 if (result == -1)
12960 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012961 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962}
12963
12964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012965PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012966 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012968Return True if S ends with the specified suffix, False otherwise.\n\
12969With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012970With optional end, stop comparing S at that position.\n\
12971suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012972
12973static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012974unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012975 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012977 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012978 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012979 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012980 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012981 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012982
Jesus Ceaac451502011-04-20 17:09:23 +020012983 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012984 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012985 if (PyTuple_Check(subobj)) {
12986 Py_ssize_t i;
12987 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012988 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012990 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012991 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012992 result = tailmatch(self, substring, start, end, +1);
12993 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012994 if (result == -1)
12995 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012996 if (result) {
12997 Py_RETURN_TRUE;
12998 }
12999 }
13000 Py_RETURN_FALSE;
13001 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013002 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013003 if (substring == NULL) {
13004 if (PyErr_ExceptionMatches(PyExc_TypeError))
13005 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13006 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013007 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013008 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013009 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013010 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013011 if (result == -1)
13012 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013013 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014}
13015
Victor Stinner202fdca2012-05-07 12:47:02 +020013016Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013017_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013018{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013019 if (!writer->readonly)
13020 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13021 else {
13022 /* Copy-on-write mode: set buffer size to 0 so
13023 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13024 * next write. */
13025 writer->size = 0;
13026 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013027 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13028 writer->data = PyUnicode_DATA(writer->buffer);
13029 writer->kind = PyUnicode_KIND(writer->buffer);
13030}
13031
Victor Stinnerd3f08822012-05-29 12:57:52 +020013032void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013033_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013034{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013035 memset(writer, 0, sizeof(*writer));
13036#ifdef Py_DEBUG
13037 writer->kind = 5; /* invalid kind */
13038#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013039 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013040}
13041
Victor Stinnerd3f08822012-05-29 12:57:52 +020013042int
13043_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13044 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013045{
13046 Py_ssize_t newlen;
13047 PyObject *newbuffer;
13048
Victor Stinnerd3f08822012-05-29 12:57:52 +020013049 assert(length > 0);
13050
Victor Stinner202fdca2012-05-07 12:47:02 +020013051 if (length > PY_SSIZE_T_MAX - writer->pos) {
13052 PyErr_NoMemory();
13053 return -1;
13054 }
13055 newlen = writer->pos + length;
13056
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013057 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013058
Victor Stinnerd3f08822012-05-29 12:57:52 +020013059 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013060 assert(!writer->readonly);
13061 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013062 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013063 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013064 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013065 if (newlen < writer->min_length)
13066 newlen = writer->min_length;
13067
Victor Stinnerd3f08822012-05-29 12:57:52 +020013068 writer->buffer = PyUnicode_New(newlen, maxchar);
13069 if (writer->buffer == NULL)
13070 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013071 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013072 else if (newlen > writer->size) {
13073 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013074 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013075 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013076 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013077 if (newlen < writer->min_length)
13078 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013079
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013080 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013081 /* resize + widen */
13082 newbuffer = PyUnicode_New(newlen, maxchar);
13083 if (newbuffer == NULL)
13084 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013085 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13086 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013087 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013088 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013089 }
13090 else {
13091 newbuffer = resize_compact(writer->buffer, newlen);
13092 if (newbuffer == NULL)
13093 return -1;
13094 }
13095 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013096 }
13097 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013098 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013099 newbuffer = PyUnicode_New(writer->size, maxchar);
13100 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013101 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013102 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13103 writer->buffer, 0, writer->pos);
13104 Py_DECREF(writer->buffer);
13105 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013106 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013107 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013108 return 0;
13109}
13110
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013111Py_LOCAL_INLINE(int)
13112_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013113{
13114 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13115 return -1;
13116 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13117 writer->pos++;
13118 return 0;
13119}
13120
13121int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013122_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13123{
13124 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13125}
13126
13127int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013128_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13129{
13130 Py_UCS4 maxchar;
13131 Py_ssize_t len;
13132
13133 if (PyUnicode_READY(str) == -1)
13134 return -1;
13135 len = PyUnicode_GET_LENGTH(str);
13136 if (len == 0)
13137 return 0;
13138 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13139 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013140 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013141 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013142 Py_INCREF(str);
13143 writer->buffer = str;
13144 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013145 writer->pos += len;
13146 return 0;
13147 }
13148 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13149 return -1;
13150 }
13151 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13152 str, 0, len);
13153 writer->pos += len;
13154 return 0;
13155}
13156
Victor Stinnere215d962012-10-06 23:03:36 +020013157int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013158_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13159 Py_ssize_t start, Py_ssize_t end)
13160{
13161 Py_UCS4 maxchar;
13162 Py_ssize_t len;
13163
13164 if (PyUnicode_READY(str) == -1)
13165 return -1;
13166
13167 assert(0 <= start);
13168 assert(end <= PyUnicode_GET_LENGTH(str));
13169 assert(start <= end);
13170
13171 if (end == 0)
13172 return 0;
13173
13174 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13175 return _PyUnicodeWriter_WriteStr(writer, str);
13176
13177 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13178 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13179 else
13180 maxchar = writer->maxchar;
13181 len = end - start;
13182
13183 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13184 return -1;
13185
13186 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13187 str, start, len);
13188 writer->pos += len;
13189 return 0;
13190}
13191
13192int
Victor Stinnere215d962012-10-06 23:03:36 +020013193_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13194{
13195 Py_UCS4 maxchar;
13196
13197 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13198 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13199 return -1;
13200 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13201 writer->pos += len;
13202 return 0;
13203}
13204
Victor Stinnerd3f08822012-05-29 12:57:52 +020013205PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013206_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013207{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013208 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013209 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013210 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013211 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013212 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013213 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013214 str = writer->buffer;
13215 writer->buffer = NULL;
13216 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13217 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013218 }
13219 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13220 PyObject *newbuffer;
13221 newbuffer = resize_compact(writer->buffer, writer->pos);
13222 if (newbuffer == NULL) {
13223 Py_DECREF(writer->buffer);
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013224 writer->buffer = NULL;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013225 return NULL;
13226 }
13227 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013228 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013229 str = writer->buffer;
13230 writer->buffer = NULL;
13231 assert(_PyUnicode_CheckConsistency(str, 1));
13232 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013233}
13234
Victor Stinnerd3f08822012-05-29 12:57:52 +020013235void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013236_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013237{
13238 Py_CLEAR(writer->buffer);
13239}
13240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013241#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013242
13243PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013244 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013245\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013246Return a formatted version of S, using substitutions from args and kwargs.\n\
13247The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013248
Eric Smith27bbca62010-11-04 17:06:58 +000013249PyDoc_STRVAR(format_map__doc__,
13250 "S.format_map(mapping) -> str\n\
13251\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013252Return a formatted version of S, using substitutions from mapping.\n\
13253The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013254
Eric Smith4a7d76d2008-05-30 18:10:19 +000013255static PyObject *
13256unicode__format__(PyObject* self, PyObject* args)
13257{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013258 PyObject *format_spec;
13259 _PyUnicodeWriter writer;
13260 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013261
13262 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13263 return NULL;
13264
Victor Stinnerd3f08822012-05-29 12:57:52 +020013265 if (PyUnicode_READY(self) == -1)
13266 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013267 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013268 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13269 self, format_spec, 0,
13270 PyUnicode_GET_LENGTH(format_spec));
13271 if (ret == -1) {
13272 _PyUnicodeWriter_Dealloc(&writer);
13273 return NULL;
13274 }
13275 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013276}
13277
Eric Smith8c663262007-08-25 02:26:07 +000013278PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013279 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013280\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013281Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013282
13283static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013284unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 Py_ssize_t size;
13287
13288 /* If it's a compact object, account for base structure +
13289 character data. */
13290 if (PyUnicode_IS_COMPACT_ASCII(v))
13291 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13292 else if (PyUnicode_IS_COMPACT(v))
13293 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013294 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295 else {
13296 /* If it is a two-block object, account for base object, and
13297 for character block if present. */
13298 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013299 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013300 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013301 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 }
13303 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013304 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013305 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013307 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013308 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309
13310 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013311}
13312
13313PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013315
13316static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013317unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013318{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013319 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 if (!copy)
13321 return NULL;
13322 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013323}
13324
Guido van Rossumd57fd912000-03-10 22:53:23 +000013325static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013326 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013327 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013328 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13329 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013330 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13331 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013332 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013333 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13334 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13335 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13336 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13337 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013338 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013339 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13340 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13341 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013342 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013343 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13344 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13345 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013346 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013347 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013348 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013349 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013350 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13351 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13352 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13353 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13354 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13355 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13356 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13357 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13358 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13359 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13360 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13361 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13362 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13363 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013364 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013365 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013366 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013367 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013368 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013369 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013370 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013371 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013372#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013373 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013374 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375#endif
13376
Benjamin Peterson14339b62009-01-31 16:36:08 +000013377 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378 {NULL, NULL}
13379};
13380
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013381static PyObject *
13382unicode_mod(PyObject *v, PyObject *w)
13383{
Brian Curtindfc80e32011-08-10 20:28:54 -050013384 if (!PyUnicode_Check(v))
13385 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013386 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013387}
13388
13389static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013390 0, /*nb_add*/
13391 0, /*nb_subtract*/
13392 0, /*nb_multiply*/
13393 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013394};
13395
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013397 (lenfunc) unicode_length, /* sq_length */
13398 PyUnicode_Concat, /* sq_concat */
13399 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13400 (ssizeargfunc) unicode_getitem, /* sq_item */
13401 0, /* sq_slice */
13402 0, /* sq_ass_item */
13403 0, /* sq_ass_slice */
13404 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013405};
13406
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013407static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013408unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 if (PyUnicode_READY(self) == -1)
13411 return NULL;
13412
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013413 if (PyIndex_Check(item)) {
13414 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013415 if (i == -1 && PyErr_Occurred())
13416 return NULL;
13417 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013418 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013419 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013420 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013421 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013422 PyObject *result;
13423 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013424 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013425 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013427 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013429 return NULL;
13430 }
13431
13432 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013433 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013434 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013435 slicelength == PyUnicode_GET_LENGTH(self)) {
13436 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013437 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013438 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013439 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013440 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013441 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013442 src_kind = PyUnicode_KIND(self);
13443 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013444 if (!PyUnicode_IS_ASCII(self)) {
13445 kind_limit = kind_maxchar_limit(src_kind);
13446 max_char = 0;
13447 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13448 ch = PyUnicode_READ(src_kind, src_data, cur);
13449 if (ch > max_char) {
13450 max_char = ch;
13451 if (max_char >= kind_limit)
13452 break;
13453 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013454 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013455 }
Victor Stinner55c99112011-10-13 01:17:06 +020013456 else
13457 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013458 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013459 if (result == NULL)
13460 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013461 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013462 dest_data = PyUnicode_DATA(result);
13463
13464 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013465 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13466 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013467 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013468 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013469 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013470 } else {
13471 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13472 return NULL;
13473 }
13474}
13475
13476static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013477 (lenfunc)unicode_length, /* mp_length */
13478 (binaryfunc)unicode_subscript, /* mp_subscript */
13479 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013480};
13481
Guido van Rossumd57fd912000-03-10 22:53:23 +000013482
Guido van Rossumd57fd912000-03-10 22:53:23 +000013483/* Helpers for PyUnicode_Format() */
13484
Victor Stinnera47082312012-10-04 02:19:54 +020013485struct unicode_formatter_t {
13486 PyObject *args;
13487 int args_owned;
13488 Py_ssize_t arglen, argidx;
13489 PyObject *dict;
13490
13491 enum PyUnicode_Kind fmtkind;
13492 Py_ssize_t fmtcnt, fmtpos;
13493 void *fmtdata;
13494 PyObject *fmtstr;
13495
13496 _PyUnicodeWriter writer;
13497};
13498
13499struct unicode_format_arg_t {
13500 Py_UCS4 ch;
13501 int flags;
13502 Py_ssize_t width;
13503 int prec;
13504 int sign;
13505};
13506
Guido van Rossumd57fd912000-03-10 22:53:23 +000013507static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013508unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013509{
Victor Stinnera47082312012-10-04 02:19:54 +020013510 Py_ssize_t argidx = ctx->argidx;
13511
13512 if (argidx < ctx->arglen) {
13513 ctx->argidx++;
13514 if (ctx->arglen < 0)
13515 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 else
Victor Stinnera47082312012-10-04 02:19:54 +020013517 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013518 }
13519 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013521 return NULL;
13522}
13523
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013524/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013525
Victor Stinnera47082312012-10-04 02:19:54 +020013526/* Format a float into the writer if the writer is not NULL, or into *p_output
13527 otherwise.
13528
13529 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530static int
Victor Stinnera47082312012-10-04 02:19:54 +020013531formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13532 PyObject **p_output,
13533 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013534{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013535 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013536 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013537 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013538 int prec;
13539 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013540
Guido van Rossumd57fd912000-03-10 22:53:23 +000013541 x = PyFloat_AsDouble(v);
13542 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013543 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013544
Victor Stinnera47082312012-10-04 02:19:54 +020013545 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013546 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013547 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013548
Victor Stinnera47082312012-10-04 02:19:54 +020013549 if (arg->flags & F_ALT)
13550 dtoa_flags = Py_DTSF_ALT;
13551 else
13552 dtoa_flags = 0;
13553 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013554 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013555 return -1;
13556 len = strlen(p);
13557 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013558 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13559 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013561 }
Victor Stinner184252a2012-06-16 02:57:41 +020013562 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013563 writer->pos += len;
13564 }
13565 else
13566 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013567 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013568 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013569}
13570
Victor Stinnerd0880d52012-04-27 23:40:13 +020013571/* formatlong() emulates the format codes d, u, o, x and X, and
13572 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13573 * Python's regular ints.
13574 * Return value: a new PyUnicodeObject*, or NULL if error.
13575 * The output string is of the form
13576 * "-"? ("0x" | "0X")? digit+
13577 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13578 * set in flags. The case of hex digits will be correct,
13579 * There will be at least prec digits, zero-filled on the left if
13580 * necessary to get that many.
13581 * val object to be converted
13582 * flags bitmask of format flags; only F_ALT is looked at
13583 * prec minimum number of digits; 0-fill on left if needed
13584 * type a character in [duoxX]; u acts the same as d
13585 *
13586 * CAUTION: o, x and X conversions on regular ints can never
13587 * produce a '-' sign, but can for Python's unbounded ints.
13588 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013589static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013590formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013591{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013592 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013593 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013594 Py_ssize_t i;
13595 int sign; /* 1 if '-', else 0 */
13596 int len; /* number of characters */
13597 Py_ssize_t llen;
13598 int numdigits; /* len == numnondigits + numdigits */
13599 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013600 int prec = arg->prec;
13601 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013602
Victor Stinnerd0880d52012-04-27 23:40:13 +020013603 /* Avoid exceeding SSIZE_T_MAX */
13604 if (prec > INT_MAX-3) {
13605 PyErr_SetString(PyExc_OverflowError,
13606 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013607 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013608 }
13609
13610 assert(PyLong_Check(val));
13611
13612 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013613 default:
13614 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013615 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013616 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013617 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013618 /* int and int subclasses should print numerically when a numeric */
13619 /* format code is used (see issue18780) */
13620 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013621 break;
13622 case 'o':
13623 numnondigits = 2;
13624 result = PyNumber_ToBase(val, 8);
13625 break;
13626 case 'x':
13627 case 'X':
13628 numnondigits = 2;
13629 result = PyNumber_ToBase(val, 16);
13630 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013631 }
13632 if (!result)
13633 return NULL;
13634
13635 assert(unicode_modifiable(result));
13636 assert(PyUnicode_IS_READY(result));
13637 assert(PyUnicode_IS_ASCII(result));
13638
13639 /* To modify the string in-place, there can only be one reference. */
13640 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013641 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013642 PyErr_BadInternalCall();
13643 return NULL;
13644 }
13645 buf = PyUnicode_DATA(result);
13646 llen = PyUnicode_GET_LENGTH(result);
13647 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013648 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013649 PyErr_SetString(PyExc_ValueError,
13650 "string too large in _PyBytes_FormatLong");
13651 return NULL;
13652 }
13653 len = (int)llen;
13654 sign = buf[0] == '-';
13655 numnondigits += sign;
13656 numdigits = len - numnondigits;
13657 assert(numdigits > 0);
13658
13659 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013660 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013661 (type == 'o' || type == 'x' || type == 'X'))) {
13662 assert(buf[sign] == '0');
13663 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13664 buf[sign+1] == 'o');
13665 numnondigits -= 2;
13666 buf += 2;
13667 len -= 2;
13668 if (sign)
13669 buf[0] = '-';
13670 assert(len == numnondigits + numdigits);
13671 assert(numdigits > 0);
13672 }
13673
13674 /* Fill with leading zeroes to meet minimum width. */
13675 if (prec > numdigits) {
13676 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13677 numnondigits + prec);
13678 char *b1;
13679 if (!r1) {
13680 Py_DECREF(result);
13681 return NULL;
13682 }
13683 b1 = PyBytes_AS_STRING(r1);
13684 for (i = 0; i < numnondigits; ++i)
13685 *b1++ = *buf++;
13686 for (i = 0; i < prec - numdigits; i++)
13687 *b1++ = '0';
13688 for (i = 0; i < numdigits; i++)
13689 *b1++ = *buf++;
13690 *b1 = '\0';
13691 Py_DECREF(result);
13692 result = r1;
13693 buf = PyBytes_AS_STRING(result);
13694 len = numnondigits + prec;
13695 }
13696
13697 /* Fix up case for hex conversions. */
13698 if (type == 'X') {
13699 /* Need to convert all lower case letters to upper case.
13700 and need to convert 0x to 0X (and -0x to -0X). */
13701 for (i = 0; i < len; i++)
13702 if (buf[i] >= 'a' && buf[i] <= 'x')
13703 buf[i] -= 'a'-'A';
13704 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013705 if (!PyUnicode_Check(result)
13706 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013707 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013708 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013709 Py_DECREF(result);
13710 result = unicode;
13711 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013712 else if (len != PyUnicode_GET_LENGTH(result)) {
13713 if (PyUnicode_Resize(&result, len) < 0)
13714 Py_CLEAR(result);
13715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013716 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013717}
13718
Victor Stinner621ef3d2012-10-02 00:33:47 +020013719/* Format an integer.
13720 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013721 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013722 * -1 and raise an exception on error */
13723static int
Victor Stinnera47082312012-10-04 02:19:54 +020013724mainformatlong(PyObject *v,
13725 struct unicode_format_arg_t *arg,
13726 PyObject **p_output,
13727 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013728{
13729 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013730 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013731
13732 if (!PyNumber_Check(v))
13733 goto wrongtype;
13734
13735 if (!PyLong_Check(v)) {
13736 iobj = PyNumber_Long(v);
13737 if (iobj == NULL) {
13738 if (PyErr_ExceptionMatches(PyExc_TypeError))
13739 goto wrongtype;
13740 return -1;
13741 }
13742 assert(PyLong_Check(iobj));
13743 }
13744 else {
13745 iobj = v;
13746 Py_INCREF(iobj);
13747 }
13748
13749 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013750 && arg->width == -1 && arg->prec == -1
13751 && !(arg->flags & (F_SIGN | F_BLANK))
13752 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013753 {
13754 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013755 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013756 int base;
13757
Victor Stinnera47082312012-10-04 02:19:54 +020013758 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013759 {
13760 default:
13761 assert(0 && "'type' not in [diuoxX]");
13762 case 'd':
13763 case 'i':
13764 case 'u':
13765 base = 10;
13766 break;
13767 case 'o':
13768 base = 8;
13769 break;
13770 case 'x':
13771 case 'X':
13772 base = 16;
13773 break;
13774 }
13775
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013776 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13777 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013778 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013779 }
13780 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013781 return 1;
13782 }
13783
Victor Stinnera47082312012-10-04 02:19:54 +020013784 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013785 Py_DECREF(iobj);
13786 if (res == NULL)
13787 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013788 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013789 return 0;
13790
13791wrongtype:
13792 PyErr_Format(PyExc_TypeError,
13793 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013794 "not %.200s",
13795 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013796 return -1;
13797}
13798
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013799static Py_UCS4
13800formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013801{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013802 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013803 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013804 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013805 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013806 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013807 goto onError;
13808 }
13809 else {
13810 /* Integer input truncated to a character */
13811 long x;
13812 x = PyLong_AsLong(v);
13813 if (x == -1 && PyErr_Occurred())
13814 goto onError;
13815
Victor Stinner8faf8212011-12-08 22:14:11 +010013816 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013817 PyErr_SetString(PyExc_OverflowError,
13818 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013819 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013820 }
13821
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013822 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013823 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013824
Benjamin Peterson29060642009-01-31 22:14:21 +000013825 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013826 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013827 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013828 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013829}
13830
Victor Stinnera47082312012-10-04 02:19:54 +020013831/* Parse options of an argument: flags, width, precision.
13832 Handle also "%(name)" syntax.
13833
13834 Return 0 if the argument has been formatted into arg->str.
13835 Return 1 if the argument has been written into ctx->writer,
13836 Raise an exception and return -1 on error. */
13837static int
13838unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13839 struct unicode_format_arg_t *arg)
13840{
13841#define FORMAT_READ(ctx) \
13842 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13843
13844 PyObject *v;
13845
Victor Stinnera47082312012-10-04 02:19:54 +020013846 if (arg->ch == '(') {
13847 /* Get argument value from a dictionary. Example: "%(name)s". */
13848 Py_ssize_t keystart;
13849 Py_ssize_t keylen;
13850 PyObject *key;
13851 int pcount = 1;
13852
13853 if (ctx->dict == NULL) {
13854 PyErr_SetString(PyExc_TypeError,
13855 "format requires a mapping");
13856 return -1;
13857 }
13858 ++ctx->fmtpos;
13859 --ctx->fmtcnt;
13860 keystart = ctx->fmtpos;
13861 /* Skip over balanced parentheses */
13862 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13863 arg->ch = FORMAT_READ(ctx);
13864 if (arg->ch == ')')
13865 --pcount;
13866 else if (arg->ch == '(')
13867 ++pcount;
13868 ctx->fmtpos++;
13869 }
13870 keylen = ctx->fmtpos - keystart - 1;
13871 if (ctx->fmtcnt < 0 || pcount > 0) {
13872 PyErr_SetString(PyExc_ValueError,
13873 "incomplete format key");
13874 return -1;
13875 }
13876 key = PyUnicode_Substring(ctx->fmtstr,
13877 keystart, keystart + keylen);
13878 if (key == NULL)
13879 return -1;
13880 if (ctx->args_owned) {
13881 Py_DECREF(ctx->args);
13882 ctx->args_owned = 0;
13883 }
13884 ctx->args = PyObject_GetItem(ctx->dict, key);
13885 Py_DECREF(key);
13886 if (ctx->args == NULL)
13887 return -1;
13888 ctx->args_owned = 1;
13889 ctx->arglen = -1;
13890 ctx->argidx = -2;
13891 }
13892
13893 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013894 while (--ctx->fmtcnt >= 0) {
13895 arg->ch = FORMAT_READ(ctx);
13896 ctx->fmtpos++;
13897 switch (arg->ch) {
13898 case '-': arg->flags |= F_LJUST; continue;
13899 case '+': arg->flags |= F_SIGN; continue;
13900 case ' ': arg->flags |= F_BLANK; continue;
13901 case '#': arg->flags |= F_ALT; continue;
13902 case '0': arg->flags |= F_ZERO; continue;
13903 }
13904 break;
13905 }
13906
13907 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013908 if (arg->ch == '*') {
13909 v = unicode_format_getnextarg(ctx);
13910 if (v == NULL)
13911 return -1;
13912 if (!PyLong_Check(v)) {
13913 PyErr_SetString(PyExc_TypeError,
13914 "* wants int");
13915 return -1;
13916 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013917 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013918 if (arg->width == -1 && PyErr_Occurred())
13919 return -1;
13920 if (arg->width < 0) {
13921 arg->flags |= F_LJUST;
13922 arg->width = -arg->width;
13923 }
13924 if (--ctx->fmtcnt >= 0) {
13925 arg->ch = FORMAT_READ(ctx);
13926 ctx->fmtpos++;
13927 }
13928 }
13929 else if (arg->ch >= '0' && arg->ch <= '9') {
13930 arg->width = arg->ch - '0';
13931 while (--ctx->fmtcnt >= 0) {
13932 arg->ch = FORMAT_READ(ctx);
13933 ctx->fmtpos++;
13934 if (arg->ch < '0' || arg->ch > '9')
13935 break;
13936 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13937 mixing signed and unsigned comparison. Since arg->ch is between
13938 '0' and '9', casting to int is safe. */
13939 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13940 PyErr_SetString(PyExc_ValueError,
13941 "width too big");
13942 return -1;
13943 }
13944 arg->width = arg->width*10 + (arg->ch - '0');
13945 }
13946 }
13947
13948 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013949 if (arg->ch == '.') {
13950 arg->prec = 0;
13951 if (--ctx->fmtcnt >= 0) {
13952 arg->ch = FORMAT_READ(ctx);
13953 ctx->fmtpos++;
13954 }
13955 if (arg->ch == '*') {
13956 v = unicode_format_getnextarg(ctx);
13957 if (v == NULL)
13958 return -1;
13959 if (!PyLong_Check(v)) {
13960 PyErr_SetString(PyExc_TypeError,
13961 "* wants int");
13962 return -1;
13963 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013964 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013965 if (arg->prec == -1 && PyErr_Occurred())
13966 return -1;
13967 if (arg->prec < 0)
13968 arg->prec = 0;
13969 if (--ctx->fmtcnt >= 0) {
13970 arg->ch = FORMAT_READ(ctx);
13971 ctx->fmtpos++;
13972 }
13973 }
13974 else if (arg->ch >= '0' && arg->ch <= '9') {
13975 arg->prec = arg->ch - '0';
13976 while (--ctx->fmtcnt >= 0) {
13977 arg->ch = FORMAT_READ(ctx);
13978 ctx->fmtpos++;
13979 if (arg->ch < '0' || arg->ch > '9')
13980 break;
13981 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13982 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013983 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013984 return -1;
13985 }
13986 arg->prec = arg->prec*10 + (arg->ch - '0');
13987 }
13988 }
13989 }
13990
13991 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13992 if (ctx->fmtcnt >= 0) {
13993 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13994 if (--ctx->fmtcnt >= 0) {
13995 arg->ch = FORMAT_READ(ctx);
13996 ctx->fmtpos++;
13997 }
13998 }
13999 }
14000 if (ctx->fmtcnt < 0) {
14001 PyErr_SetString(PyExc_ValueError,
14002 "incomplete format");
14003 return -1;
14004 }
14005 return 0;
14006
14007#undef FORMAT_READ
14008}
14009
14010/* Format one argument. Supported conversion specifiers:
14011
14012 - "s", "r", "a": any type
14013 - "i", "d", "u", "o", "x", "X": int
14014 - "e", "E", "f", "F", "g", "G": float
14015 - "c": int or str (1 character)
14016
Victor Stinner8dbd4212012-12-04 09:30:24 +010014017 When possible, the output is written directly into the Unicode writer
14018 (ctx->writer). A string is created when padding is required.
14019
Victor Stinnera47082312012-10-04 02:19:54 +020014020 Return 0 if the argument has been formatted into *p_str,
14021 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014022 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014023static int
14024unicode_format_arg_format(struct unicode_formatter_t *ctx,
14025 struct unicode_format_arg_t *arg,
14026 PyObject **p_str)
14027{
14028 PyObject *v;
14029 _PyUnicodeWriter *writer = &ctx->writer;
14030
14031 if (ctx->fmtcnt == 0)
14032 ctx->writer.overallocate = 0;
14033
14034 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014035 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014036 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014037 return 1;
14038 }
14039
14040 v = unicode_format_getnextarg(ctx);
14041 if (v == NULL)
14042 return -1;
14043
Victor Stinnera47082312012-10-04 02:19:54 +020014044
14045 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014046 case 's':
14047 case 'r':
14048 case 'a':
14049 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14050 /* Fast path */
14051 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14052 return -1;
14053 return 1;
14054 }
14055
14056 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14057 *p_str = v;
14058 Py_INCREF(*p_str);
14059 }
14060 else {
14061 if (arg->ch == 's')
14062 *p_str = PyObject_Str(v);
14063 else if (arg->ch == 'r')
14064 *p_str = PyObject_Repr(v);
14065 else
14066 *p_str = PyObject_ASCII(v);
14067 }
14068 break;
14069
14070 case 'i':
14071 case 'd':
14072 case 'u':
14073 case 'o':
14074 case 'x':
14075 case 'X':
14076 {
14077 int ret = mainformatlong(v, arg, p_str, writer);
14078 if (ret != 0)
14079 return ret;
14080 arg->sign = 1;
14081 break;
14082 }
14083
14084 case 'e':
14085 case 'E':
14086 case 'f':
14087 case 'F':
14088 case 'g':
14089 case 'G':
14090 if (arg->width == -1 && arg->prec == -1
14091 && !(arg->flags & (F_SIGN | F_BLANK)))
14092 {
14093 /* Fast path */
14094 if (formatfloat(v, arg, NULL, writer) == -1)
14095 return -1;
14096 return 1;
14097 }
14098
14099 arg->sign = 1;
14100 if (formatfloat(v, arg, p_str, NULL) == -1)
14101 return -1;
14102 break;
14103
14104 case 'c':
14105 {
14106 Py_UCS4 ch = formatchar(v);
14107 if (ch == (Py_UCS4) -1)
14108 return -1;
14109 if (arg->width == -1 && arg->prec == -1) {
14110 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014111 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014112 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014113 return 1;
14114 }
14115 *p_str = PyUnicode_FromOrdinal(ch);
14116 break;
14117 }
14118
14119 default:
14120 PyErr_Format(PyExc_ValueError,
14121 "unsupported format character '%c' (0x%x) "
14122 "at index %zd",
14123 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14124 (int)arg->ch,
14125 ctx->fmtpos - 1);
14126 return -1;
14127 }
14128 if (*p_str == NULL)
14129 return -1;
14130 assert (PyUnicode_Check(*p_str));
14131 return 0;
14132}
14133
14134static int
14135unicode_format_arg_output(struct unicode_formatter_t *ctx,
14136 struct unicode_format_arg_t *arg,
14137 PyObject *str)
14138{
14139 Py_ssize_t len;
14140 enum PyUnicode_Kind kind;
14141 void *pbuf;
14142 Py_ssize_t pindex;
14143 Py_UCS4 signchar;
14144 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014145 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014146 Py_ssize_t sublen;
14147 _PyUnicodeWriter *writer = &ctx->writer;
14148 Py_UCS4 fill;
14149
14150 fill = ' ';
14151 if (arg->sign && arg->flags & F_ZERO)
14152 fill = '0';
14153
14154 if (PyUnicode_READY(str) == -1)
14155 return -1;
14156
14157 len = PyUnicode_GET_LENGTH(str);
14158 if ((arg->width == -1 || arg->width <= len)
14159 && (arg->prec == -1 || arg->prec >= len)
14160 && !(arg->flags & (F_SIGN | F_BLANK)))
14161 {
14162 /* Fast path */
14163 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14164 return -1;
14165 return 0;
14166 }
14167
14168 /* Truncate the string for "s", "r" and "a" formats
14169 if the precision is set */
14170 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14171 if (arg->prec >= 0 && len > arg->prec)
14172 len = arg->prec;
14173 }
14174
14175 /* Adjust sign and width */
14176 kind = PyUnicode_KIND(str);
14177 pbuf = PyUnicode_DATA(str);
14178 pindex = 0;
14179 signchar = '\0';
14180 if (arg->sign) {
14181 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14182 if (ch == '-' || ch == '+') {
14183 signchar = ch;
14184 len--;
14185 pindex++;
14186 }
14187 else if (arg->flags & F_SIGN)
14188 signchar = '+';
14189 else if (arg->flags & F_BLANK)
14190 signchar = ' ';
14191 else
14192 arg->sign = 0;
14193 }
14194 if (arg->width < len)
14195 arg->width = len;
14196
14197 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014198 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014199 if (!(arg->flags & F_LJUST)) {
14200 if (arg->sign) {
14201 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014202 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014203 }
14204 else {
14205 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014206 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014207 }
14208 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014209 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14210 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014211 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014212 }
14213
Victor Stinnera47082312012-10-04 02:19:54 +020014214 buflen = arg->width;
14215 if (arg->sign && len == arg->width)
14216 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014217 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014218 return -1;
14219
14220 /* Write the sign if needed */
14221 if (arg->sign) {
14222 if (fill != ' ') {
14223 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14224 writer->pos += 1;
14225 }
14226 if (arg->width > len)
14227 arg->width--;
14228 }
14229
14230 /* Write the numeric prefix for "x", "X" and "o" formats
14231 if the alternate form is used.
14232 For example, write "0x" for the "%#x" format. */
14233 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14234 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14235 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14236 if (fill != ' ') {
14237 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14238 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14239 writer->pos += 2;
14240 pindex += 2;
14241 }
14242 arg->width -= 2;
14243 if (arg->width < 0)
14244 arg->width = 0;
14245 len -= 2;
14246 }
14247
14248 /* Pad left with the fill character if needed */
14249 if (arg->width > len && !(arg->flags & F_LJUST)) {
14250 sublen = arg->width - len;
14251 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14252 writer->pos += sublen;
14253 arg->width = len;
14254 }
14255
14256 /* If padding with spaces: write sign if needed and/or numeric prefix if
14257 the alternate form is used */
14258 if (fill == ' ') {
14259 if (arg->sign) {
14260 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14261 writer->pos += 1;
14262 }
14263 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14264 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14265 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14266 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14267 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14268 writer->pos += 2;
14269 pindex += 2;
14270 }
14271 }
14272
14273 /* Write characters */
14274 if (len) {
14275 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14276 str, pindex, len);
14277 writer->pos += len;
14278 }
14279
14280 /* Pad right with the fill character if needed */
14281 if (arg->width > len) {
14282 sublen = arg->width - len;
14283 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14284 writer->pos += sublen;
14285 }
14286 return 0;
14287}
14288
14289/* Helper of PyUnicode_Format(): format one arg.
14290 Return 0 on success, raise an exception and return -1 on error. */
14291static int
14292unicode_format_arg(struct unicode_formatter_t *ctx)
14293{
14294 struct unicode_format_arg_t arg;
14295 PyObject *str;
14296 int ret;
14297
Victor Stinner8dbd4212012-12-04 09:30:24 +010014298 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14299 arg.flags = 0;
14300 arg.width = -1;
14301 arg.prec = -1;
14302 arg.sign = 0;
14303 str = NULL;
14304
Victor Stinnera47082312012-10-04 02:19:54 +020014305 ret = unicode_format_arg_parse(ctx, &arg);
14306 if (ret == -1)
14307 return -1;
14308
14309 ret = unicode_format_arg_format(ctx, &arg, &str);
14310 if (ret == -1)
14311 return -1;
14312
14313 if (ret != 1) {
14314 ret = unicode_format_arg_output(ctx, &arg, str);
14315 Py_DECREF(str);
14316 if (ret == -1)
14317 return -1;
14318 }
14319
14320 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14321 PyErr_SetString(PyExc_TypeError,
14322 "not all arguments converted during string formatting");
14323 return -1;
14324 }
14325 return 0;
14326}
14327
Alexander Belopolsky40018472011-02-26 01:02:56 +000014328PyObject *
14329PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014330{
Victor Stinnera47082312012-10-04 02:19:54 +020014331 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014332
Guido van Rossumd57fd912000-03-10 22:53:23 +000014333 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014334 PyErr_BadInternalCall();
14335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014336 }
Victor Stinnera47082312012-10-04 02:19:54 +020014337
14338 ctx.fmtstr = PyUnicode_FromObject(format);
14339 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014340 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014341 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14342 Py_DECREF(ctx.fmtstr);
14343 return NULL;
14344 }
14345 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14346 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14347 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14348 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014349
Victor Stinner8f674cc2013-04-17 23:02:17 +020014350 _PyUnicodeWriter_Init(&ctx.writer);
14351 ctx.writer.min_length = ctx.fmtcnt + 100;
14352 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014353
Guido van Rossumd57fd912000-03-10 22:53:23 +000014354 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014355 ctx.arglen = PyTuple_Size(args);
14356 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014357 }
14358 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014359 ctx.arglen = -1;
14360 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014361 }
Victor Stinnera47082312012-10-04 02:19:54 +020014362 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014363 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014364 ctx.dict = args;
14365 else
14366 ctx.dict = NULL;
14367 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014368
Victor Stinnera47082312012-10-04 02:19:54 +020014369 while (--ctx.fmtcnt >= 0) {
14370 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014371 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014372
14373 nonfmtpos = ctx.fmtpos++;
14374 while (ctx.fmtcnt >= 0 &&
14375 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14376 ctx.fmtpos++;
14377 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014378 }
Victor Stinnera47082312012-10-04 02:19:54 +020014379 if (ctx.fmtcnt < 0) {
14380 ctx.fmtpos--;
14381 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014382 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014383
Victor Stinnercfc4c132013-04-03 01:48:39 +020014384 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14385 nonfmtpos, ctx.fmtpos) < 0)
14386 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014387 }
14388 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014389 ctx.fmtpos++;
14390 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014391 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014392 }
14393 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014394
Victor Stinnera47082312012-10-04 02:19:54 +020014395 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014396 PyErr_SetString(PyExc_TypeError,
14397 "not all arguments converted during string formatting");
14398 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014399 }
14400
Victor Stinnera47082312012-10-04 02:19:54 +020014401 if (ctx.args_owned) {
14402 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014403 }
Victor Stinnera47082312012-10-04 02:19:54 +020014404 Py_DECREF(ctx.fmtstr);
14405 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014406
Benjamin Peterson29060642009-01-31 22:14:21 +000014407 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014408 Py_DECREF(ctx.fmtstr);
14409 _PyUnicodeWriter_Dealloc(&ctx.writer);
14410 if (ctx.args_owned) {
14411 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014412 }
14413 return NULL;
14414}
14415
Jeremy Hylton938ace62002-07-17 16:30:39 +000014416static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014417unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14418
Tim Peters6d6c1a32001-08-02 04:15:00 +000014419static PyObject *
14420unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14421{
Benjamin Peterson29060642009-01-31 22:14:21 +000014422 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014423 static char *kwlist[] = {"object", "encoding", "errors", 0};
14424 char *encoding = NULL;
14425 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014426
Benjamin Peterson14339b62009-01-31 16:36:08 +000014427 if (type != &PyUnicode_Type)
14428 return unicode_subtype_new(type, args, kwds);
14429 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014430 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014431 return NULL;
14432 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014433 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014434 if (encoding == NULL && errors == NULL)
14435 return PyObject_Str(x);
14436 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014437 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014438}
14439
Guido van Rossume023fe02001-08-30 03:12:59 +000014440static PyObject *
14441unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14442{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014443 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014444 Py_ssize_t length, char_size;
14445 int share_wstr, share_utf8;
14446 unsigned int kind;
14447 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014448
Benjamin Peterson14339b62009-01-31 16:36:08 +000014449 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014450
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014451 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014452 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014453 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014454 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014455 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014456 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014457 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014458 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014459
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014460 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014461 if (self == NULL) {
14462 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014463 return NULL;
14464 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014465 kind = PyUnicode_KIND(unicode);
14466 length = PyUnicode_GET_LENGTH(unicode);
14467
14468 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014469#ifdef Py_DEBUG
14470 _PyUnicode_HASH(self) = -1;
14471#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014472 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014473#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014474 _PyUnicode_STATE(self).interned = 0;
14475 _PyUnicode_STATE(self).kind = kind;
14476 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014477 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014478 _PyUnicode_STATE(self).ready = 1;
14479 _PyUnicode_WSTR(self) = NULL;
14480 _PyUnicode_UTF8_LENGTH(self) = 0;
14481 _PyUnicode_UTF8(self) = NULL;
14482 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014483 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014484
14485 share_utf8 = 0;
14486 share_wstr = 0;
14487 if (kind == PyUnicode_1BYTE_KIND) {
14488 char_size = 1;
14489 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14490 share_utf8 = 1;
14491 }
14492 else if (kind == PyUnicode_2BYTE_KIND) {
14493 char_size = 2;
14494 if (sizeof(wchar_t) == 2)
14495 share_wstr = 1;
14496 }
14497 else {
14498 assert(kind == PyUnicode_4BYTE_KIND);
14499 char_size = 4;
14500 if (sizeof(wchar_t) == 4)
14501 share_wstr = 1;
14502 }
14503
14504 /* Ensure we won't overflow the length. */
14505 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14506 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014507 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014508 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014509 data = PyObject_MALLOC((length + 1) * char_size);
14510 if (data == NULL) {
14511 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014512 goto onError;
14513 }
14514
Victor Stinnerc3c74152011-10-02 20:39:55 +020014515 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014516 if (share_utf8) {
14517 _PyUnicode_UTF8_LENGTH(self) = length;
14518 _PyUnicode_UTF8(self) = data;
14519 }
14520 if (share_wstr) {
14521 _PyUnicode_WSTR_LENGTH(self) = length;
14522 _PyUnicode_WSTR(self) = (wchar_t *)data;
14523 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014524
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014525 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014526 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014527 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014528#ifdef Py_DEBUG
14529 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14530#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014531 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014532 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014533
14534onError:
14535 Py_DECREF(unicode);
14536 Py_DECREF(self);
14537 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014538}
14539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014540PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014541"str(object='') -> str\n\
14542str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014543\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014544Create a new string object from the given object. If encoding or\n\
14545errors is specified, then the object must expose a data buffer\n\
14546that will be decoded using the given encoding and error handler.\n\
14547Otherwise, returns the result of object.__str__() (if defined)\n\
14548or repr(object).\n\
14549encoding defaults to sys.getdefaultencoding().\n\
14550errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014551
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014552static PyObject *unicode_iter(PyObject *seq);
14553
Guido van Rossumd57fd912000-03-10 22:53:23 +000014554PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014555 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014556 "str", /* tp_name */
14557 sizeof(PyUnicodeObject), /* tp_size */
14558 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014559 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014560 (destructor)unicode_dealloc, /* tp_dealloc */
14561 0, /* tp_print */
14562 0, /* tp_getattr */
14563 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014564 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014565 unicode_repr, /* tp_repr */
14566 &unicode_as_number, /* tp_as_number */
14567 &unicode_as_sequence, /* tp_as_sequence */
14568 &unicode_as_mapping, /* tp_as_mapping */
14569 (hashfunc) unicode_hash, /* tp_hash*/
14570 0, /* tp_call*/
14571 (reprfunc) unicode_str, /* tp_str */
14572 PyObject_GenericGetAttr, /* tp_getattro */
14573 0, /* tp_setattro */
14574 0, /* tp_as_buffer */
14575 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014576 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014577 unicode_doc, /* tp_doc */
14578 0, /* tp_traverse */
14579 0, /* tp_clear */
14580 PyUnicode_RichCompare, /* tp_richcompare */
14581 0, /* tp_weaklistoffset */
14582 unicode_iter, /* tp_iter */
14583 0, /* tp_iternext */
14584 unicode_methods, /* tp_methods */
14585 0, /* tp_members */
14586 0, /* tp_getset */
14587 &PyBaseObject_Type, /* tp_base */
14588 0, /* tp_dict */
14589 0, /* tp_descr_get */
14590 0, /* tp_descr_set */
14591 0, /* tp_dictoffset */
14592 0, /* tp_init */
14593 0, /* tp_alloc */
14594 unicode_new, /* tp_new */
14595 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014596};
14597
14598/* Initialize the Unicode implementation */
14599
Victor Stinner3a50e702011-10-18 21:21:00 +020014600int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014601{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014602 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014603 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014604 0x000A, /* LINE FEED */
14605 0x000D, /* CARRIAGE RETURN */
14606 0x001C, /* FILE SEPARATOR */
14607 0x001D, /* GROUP SEPARATOR */
14608 0x001E, /* RECORD SEPARATOR */
14609 0x0085, /* NEXT LINE */
14610 0x2028, /* LINE SEPARATOR */
14611 0x2029, /* PARAGRAPH SEPARATOR */
14612 };
14613
Fred Drakee4315f52000-05-09 19:53:39 +000014614 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014615 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014616 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014617 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014618 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014619
Guido van Rossumcacfc072002-05-24 19:01:59 +000014620 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014621 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014622
14623 /* initialize the linebreak bloom filter */
14624 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014625 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014626 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014627
Christian Heimes26532f72013-07-20 14:57:16 +020014628 if (PyType_Ready(&EncodingMapType) < 0)
14629 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014630
Benjamin Petersonc4311282012-10-30 23:21:10 -040014631 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14632 Py_FatalError("Can't initialize field name iterator type");
14633
14634 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14635 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014636
Victor Stinner3a50e702011-10-18 21:21:00 +020014637#ifdef HAVE_MBCS
14638 winver.dwOSVersionInfoSize = sizeof(winver);
14639 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14640 PyErr_SetFromWindowsErr(0);
14641 return -1;
14642 }
14643#endif
14644 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014645}
14646
14647/* Finalize the Unicode implementation */
14648
/* Does no work in this implementation: always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
14654
Guido van Rossumd57fd912000-03-10 22:53:23 +000014655void
Thomas Wouters78890102000-07-22 19:25:51 +000014656_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014657{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014658 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014659
Serhiy Storchaka05997252013-01-26 12:14:02 +020014660 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014661
Serhiy Storchaka05997252013-01-26 12:14:02 +020014662 for (i = 0; i < 256; i++)
14663 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014664 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014665 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014666}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014667
Walter Dörwald16807132007-05-25 13:52:07 +000014668void
14669PyUnicode_InternInPlace(PyObject **p)
14670{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014671 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014672 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014673#ifdef Py_DEBUG
14674 assert(s != NULL);
14675 assert(_PyUnicode_CHECK(s));
14676#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014677 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014678 return;
14679#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014680 /* If it's a subclass, we don't really know what putting
14681 it in the interned dict might do. */
14682 if (!PyUnicode_CheckExact(s))
14683 return;
14684 if (PyUnicode_CHECK_INTERNED(s))
14685 return;
14686 if (interned == NULL) {
14687 interned = PyDict_New();
14688 if (interned == NULL) {
14689 PyErr_Clear(); /* Don't leave an exception */
14690 return;
14691 }
14692 }
14693 /* It might be that the GetItem call fails even
14694 though the key is present in the dictionary,
14695 namely when this happens during a stack overflow. */
14696 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014697 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014698 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014699
Victor Stinnerf0335102013-04-14 19:13:03 +020014700 if (t) {
14701 Py_INCREF(t);
14702 Py_DECREF(*p);
14703 *p = t;
14704 return;
14705 }
Walter Dörwald16807132007-05-25 13:52:07 +000014706
Benjamin Peterson14339b62009-01-31 16:36:08 +000014707 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014708 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014709 PyErr_Clear();
14710 PyThreadState_GET()->recursion_critical = 0;
14711 return;
14712 }
14713 PyThreadState_GET()->recursion_critical = 0;
14714 /* The two references in interned are not counted by refcnt.
14715 The deallocator will take care of this */
14716 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014717 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014718}
14719
14720void
14721PyUnicode_InternImmortal(PyObject **p)
14722{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014723 PyUnicode_InternInPlace(p);
14724 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014725 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014726 Py_INCREF(*p);
14727 }
Walter Dörwald16807132007-05-25 13:52:07 +000014728}
14729
14730PyObject *
14731PyUnicode_InternFromString(const char *cp)
14732{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014733 PyObject *s = PyUnicode_FromString(cp);
14734 if (s == NULL)
14735 return NULL;
14736 PyUnicode_InternInPlace(&s);
14737 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014738}
14739
Alexander Belopolsky40018472011-02-26 01:02:56 +000014740void
14741_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014742{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014743 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014744 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014745 Py_ssize_t i, n;
14746 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014747
Benjamin Peterson14339b62009-01-31 16:36:08 +000014748 if (interned == NULL || !PyDict_Check(interned))
14749 return;
14750 keys = PyDict_Keys(interned);
14751 if (keys == NULL || !PyList_Check(keys)) {
14752 PyErr_Clear();
14753 return;
14754 }
Walter Dörwald16807132007-05-25 13:52:07 +000014755
Benjamin Peterson14339b62009-01-31 16:36:08 +000014756 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14757 detector, interned unicode strings are not forcibly deallocated;
14758 rather, we give them their stolen references back, and then clear
14759 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014760
Benjamin Peterson14339b62009-01-31 16:36:08 +000014761 n = PyList_GET_SIZE(keys);
14762 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014763 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014764 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014765 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014766 if (PyUnicode_READY(s) == -1) {
14767 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014768 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014770 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014771 case SSTATE_NOT_INTERNED:
14772 /* XXX Shouldn't happen */
14773 break;
14774 case SSTATE_INTERNED_IMMORTAL:
14775 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014776 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014777 break;
14778 case SSTATE_INTERNED_MORTAL:
14779 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014780 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014781 break;
14782 default:
14783 Py_FatalError("Inconsistent interned string state.");
14784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014785 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014786 }
14787 fprintf(stderr, "total size of all interned strings: "
14788 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14789 "mortal/immortal\n", mortal_size, immortal_size);
14790 Py_DECREF(keys);
14791 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014792 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014793}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014794
14795
14796/********************* Unicode Iterator **************************/
14797
14798typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014799 PyObject_HEAD
14800 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014801 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014802} unicodeiterobject;
14803
14804static void
14805unicodeiter_dealloc(unicodeiterobject *it)
14806{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014807 _PyObject_GC_UNTRACK(it);
14808 Py_XDECREF(it->it_seq);
14809 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014810}
14811
14812static int
14813unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14814{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014815 Py_VISIT(it->it_seq);
14816 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014817}
14818
14819static PyObject *
14820unicodeiter_next(unicodeiterobject *it)
14821{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014822 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014823
Benjamin Peterson14339b62009-01-31 16:36:08 +000014824 assert(it != NULL);
14825 seq = it->it_seq;
14826 if (seq == NULL)
14827 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014828 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014830 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14831 int kind = PyUnicode_KIND(seq);
14832 void *data = PyUnicode_DATA(seq);
14833 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14834 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014835 if (item != NULL)
14836 ++it->it_index;
14837 return item;
14838 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014839
Benjamin Peterson14339b62009-01-31 16:36:08 +000014840 Py_DECREF(seq);
14841 it->it_seq = NULL;
14842 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014843}
14844
14845static PyObject *
14846unicodeiter_len(unicodeiterobject *it)
14847{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014848 Py_ssize_t len = 0;
14849 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014850 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014851 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014852}
14853
14854PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14855
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014856static PyObject *
14857unicodeiter_reduce(unicodeiterobject *it)
14858{
14859 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014860 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014861 it->it_seq, it->it_index);
14862 } else {
14863 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14864 if (u == NULL)
14865 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014866 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014867 }
14868}
14869
14870PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14871
14872static PyObject *
14873unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14874{
14875 Py_ssize_t index = PyLong_AsSsize_t(state);
14876 if (index == -1 && PyErr_Occurred())
14877 return NULL;
14878 if (index < 0)
14879 index = 0;
14880 it->it_index = index;
14881 Py_RETURN_NONE;
14882}
14883
14884PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14885
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014886static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014887 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014888 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014889 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14890 reduce_doc},
14891 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14892 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014893 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014894};
14895
14896PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014897 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14898 "str_iterator", /* tp_name */
14899 sizeof(unicodeiterobject), /* tp_basicsize */
14900 0, /* tp_itemsize */
14901 /* methods */
14902 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14903 0, /* tp_print */
14904 0, /* tp_getattr */
14905 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014906 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014907 0, /* tp_repr */
14908 0, /* tp_as_number */
14909 0, /* tp_as_sequence */
14910 0, /* tp_as_mapping */
14911 0, /* tp_hash */
14912 0, /* tp_call */
14913 0, /* tp_str */
14914 PyObject_GenericGetAttr, /* tp_getattro */
14915 0, /* tp_setattro */
14916 0, /* tp_as_buffer */
14917 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14918 0, /* tp_doc */
14919 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14920 0, /* tp_clear */
14921 0, /* tp_richcompare */
14922 0, /* tp_weaklistoffset */
14923 PyObject_SelfIter, /* tp_iter */
14924 (iternextfunc)unicodeiter_next, /* tp_iternext */
14925 unicodeiter_methods, /* tp_methods */
14926 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014927};
14928
14929static PyObject *
14930unicode_iter(PyObject *seq)
14931{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014932 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014933
Benjamin Peterson14339b62009-01-31 16:36:08 +000014934 if (!PyUnicode_Check(seq)) {
14935 PyErr_BadInternalCall();
14936 return NULL;
14937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014938 if (PyUnicode_READY(seq) == -1)
14939 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014940 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14941 if (it == NULL)
14942 return NULL;
14943 it->it_index = 0;
14944 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014945 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014946 _PyObject_GC_TRACK(it);
14947 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014948}
14949
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014950
/* Return the number of Py_UNICODE elements in the null-terminated buffer
   u, not counting the terminating null.

   The length is computed from a pointer difference and returned as
   size_t, so no int counter can overflow on very long buffers. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end)
        end++;
    return (size_t)(end - u);
}
14959
/* Copy the null-terminated buffer s2, including its terminating null,
   into s1 and return s1.  s1 must be large enough; no bounds check is
   performed. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (!ch)
            break;
    }
    return s1;
}
14967
/* Copy at most n elements from the null-terminated buffer s2 into s1 and
   return s1.

   Copying stops after the terminating null of s2 has been copied or
   after n elements have been written, whichever comes first.  Nothing is
   written when n is 0, and the result is not null-terminated when s2
   holds n or more characters.  The rest of s1 is not padded.

   The previous loop tested the counter only after each store and so
   could write n + 1 elements (one element for n == 0), overrunning a
   destination sized for n elements. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    while (n > 0) {
        n--;
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14977
/* Append the null-terminated buffer s2 to the null-terminated buffer s1
   and return s1.  s1 must have room for the combined string; no bounds
   check is performed. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing at the terminating null of s1. */
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
14986
/* Compare two null-terminated Py_UNICODE buffers element by element.

   Returns -1 if s1 sorts before s2, +1 if it sorts after and 0 if both
   are equal.  A string that is a proper prefix of the other sorts
   first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    for (; *s1 && *s1 == *s2; s1++, s2++)
        ;

    if (!*s1)
        return *s2 ? -1 : 0;    /* s1 ended first, or both ended */
    if (!*s2)
        return 1;               /* s2 ended first */
    return (*s1 < *s2) ? -1 : +1;
}
15000
/* Compare at most the first n elements of two null-terminated Py_UNICODE
   buffers.

   Returns -1 or +1 at the first differing element, and 0 if the buffers
   agree up to n elements or up to a common terminating null. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE a = s1[i];
        Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;              /* both strings ended together */
    }
    return 0;
}
15017
/* Return a pointer to the first occurrence of c in the null-terminated
   buffer s, or NULL if there is none.

   The terminating null is not part of the search, so looking for a null
   character always yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
15027
/* Return a pointer to the last occurrence of c in the null-terminated
   buffer s, or NULL if there is none.

   The terminating null is not part of the search. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = Py_UNICODE_strlen(s);

    /* Walk backwards from the last character to the first one. */
    while (pos > 0) {
        pos--;
        if (s[pos] == c)
            return (Py_UNICODE*)(s + pos);
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015040
Victor Stinner71133ff2010-09-01 23:43:53 +000015041Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015042PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015043{
Victor Stinner577db2c2011-10-11 22:12:48 +020015044 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015045 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015047 if (!PyUnicode_Check(unicode)) {
15048 PyErr_BadArgument();
15049 return NULL;
15050 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015051 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015052 if (u == NULL)
15053 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015054 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015055 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015056 PyErr_NoMemory();
15057 return NULL;
15058 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015059 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015060 size *= sizeof(Py_UNICODE);
15061 copy = PyMem_Malloc(size);
15062 if (copy == NULL) {
15063 PyErr_NoMemory();
15064 return NULL;
15065 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015066 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015067 return copy;
15068}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015069
Georg Brandl66c221e2010-10-14 07:04:07 +000015070/* A _string module, to export formatter_parser and formatter_field_name_split
15071 to the string.Formatter class implemented in Python. */
15072
15073static PyMethodDef _string_methods[] = {
15074 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15075 METH_O, PyDoc_STR("split the argument as a field name")},
15076 {"formatter_parser", (PyCFunction) formatter_parser,
15077 METH_O, PyDoc_STR("parse the argument as a format string")},
15078 {NULL, NULL}
15079};
15080
15081static struct PyModuleDef _string_module = {
15082 PyModuleDef_HEAD_INIT,
15083 "_string",
15084 PyDoc_STR("string helper module"),
15085 0,
15086 _string_methods,
15087 NULL,
15088 NULL,
15089 NULL,
15090 NULL
15091};
15092
15093PyMODINIT_FUNC
15094PyInit__string(void)
15095{
15096 return PyModule_Create(&_string_module);
15097}
15098
15099
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015100#ifdef __cplusplus
15101}
15102#endif