blob: e6195fe53b886377c1f2b28abe1b2239a9d5995a [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Argument check used by the accessor macros of this file: a plain type
   check in release builds, a structural consistency check (without content
   verification) when Py_DEBUG is defined. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* --- Raw field accessors --------------------------------------------------
   These expand to the structure member itself without any checking, so
   they can also be used as assignment targets. */

/* Members reached through the PyASCIIObject header. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Members reached through the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Data pointer member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors ----------------------------------------------------
   These assert on their argument before reading the member. */

/* Kind and length, read after asserting that op is a Unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory immediately following the PyASCIIObject header; for every other
   string it is the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length that goes with PyUnicode_UTF8(): the string length for a compact
   ASCII string, the utf8_length member otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* File-local replacement for PyUnicode_READY(): asserts that op is a Unicode
   object, evaluates to 0 when the string is already ready and to the result
   of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
          _PyUnicode_Ready(op) :                        \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200113
/* True if the utf8 pointer of op is the same pointer as its character data
   (the UTF-8 form shares the data buffer).  Must not be used on compact
   ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same pointer as its character data
   (the wchar_t form shares the data buffer).
   The macro refers only to its own parameter; it does not depend on a
   variable named "unicode" being in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
121
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not a compact ASCII string, its utf8 pointer is set, and that
   pointer is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
135
/* Generic helper macro to convert characters of different types.

   from_type, to_type
       Valid type names of the source and destination characters.
   begin, end
       Pointers of type "from_type *" delimiting the source characters;
       end points one past the last character to convert.
   to
       Pointer to the buffer receiving the converted characters.  The whole
       argument is converted to "to_type *", so an expression such as
       "buffer + offset" keeps the pointer arithmetic of its own type.

   Each source character is cast to to_type and stored.  The main loop
   handles four characters per iteration; a second loop converts the
   remaining (at most three) characters.  Every argument is evaluated
   exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200159
Walter Dörwald16807132007-05-25 13:52:07 +0000160/* This dictionary holds all interned unicode strings. Note that references
161 to strings in this dictionary are *not* counted in the string's ob_refcnt.
162 When the interned string reaches a refcnt of 0 the string deallocation
163 function will delete the reference from this dictionary.
164
165 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000166 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000167*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200168static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000169
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000170/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200171static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200172
Serhiy Storchaka678db842013-01-26 12:16:36 +0200173#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200174 do { \
175 if (unicode_empty != NULL) \
176 Py_INCREF(unicode_empty); \
177 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178 unicode_empty = PyUnicode_New(0, 0); \
179 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200180 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200181 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
182 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200183 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200184 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186#define _Py_RETURN_UNICODE_EMPTY() \
187 do { \
188 _Py_INCREF_UNICODE_EMPTY(); \
189 return unicode_empty; \
190 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200192/* Forward declaration */
193Py_LOCAL_INLINE(int)
194_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
195
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200196/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200197static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* Single character Unicode strings in the Latin-1 range are being
200 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when ASCII code c is a whitespace character and 0 otherwise.  The table
   has 128 entries, eight per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x09 CHARACTER TABULATION
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
233
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200234/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200235static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200236static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100237static int unicode_modifiable(PyObject *unicode);
238
Victor Stinnerfe226c02011-10-03 03:52:20 +0200239
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100241_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200242static PyObject *
243_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
244static PyObject *
245_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
246
247static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000248unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000249 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100250 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000251 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
252
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253static void
254raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300255 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100256 PyObject *unicode,
257 Py_ssize_t startpos, Py_ssize_t endpos,
258 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000259
/* Same as _Py_ascii_whitespace, for line breaks: entry c is 1 when ASCII
   code c is a line break character and 0 otherwise (128 entries, eight per
   row).  The table is a read-only lookup table, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
287
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300288/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
289 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000291PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000293#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000294 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000296 /* This is actually an illegal character, so it should
297 not be passed to unichr. */
298 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299#endif
300}
301
#ifdef Py_DEBUG
/* Verify the internal invariants of a Unicode object (debug builds only).

   op             object to check; must be a Unicode object.
   check_content  if non-zero and the string is not in the wchar_t-only
                  state, also scan the characters to verify that the
                  narrowest suitable kind is used and that the buffer is
                  terminated by a null character.

   Every violated invariant trips an assert(); when all checks pass the
   function returns 1, so that the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose characters have the size of wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact != 1) {
            /* non-compact object: the data pointer lives in the object */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* not ready: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII string: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }

        /* relation between wstr and the character data */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent representation must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 cur;

        data = PyUnicode_DATA(ascii);
        pos = 0;
        while (pos < ascii->length) {
            cur = PyUnicode_READ(kind, data, pos);
            if (cur > maxchar)
                maxchar = cur;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
417
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100418static PyObject*
419unicode_result_wchar(PyObject *unicode)
420{
421#ifndef Py_DEBUG
422 Py_ssize_t len;
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100426 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200427 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100432 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200440 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 return NULL;
442 }
443#else
Victor Stinneraa771272012-10-04 02:32:58 +0200444 assert(Py_REFCNT(unicode) == 1);
445
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 /* don't make the result ready in debug mode to ensure that the caller
447 makes the string ready before using it */
448 assert(_PyUnicode_CheckConsistency(unicode, 1));
449#endif
450 return unicode;
451}
452
453static PyObject*
454unicode_result_ready(PyObject *unicode)
455{
456 Py_ssize_t length;
457
458 length = PyUnicode_GET_LENGTH(unicode);
459 if (length == 0) {
460 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100461 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200462 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100463 }
464 return unicode_empty;
465 }
466
467 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200468 void *data = PyUnicode_DATA(unicode);
469 int kind = PyUnicode_KIND(unicode);
470 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100471 if (ch < 256) {
472 PyObject *latin1_char = unicode_latin1[ch];
473 if (latin1_char != NULL) {
474 if (unicode != latin1_char) {
475 Py_INCREF(latin1_char);
476 Py_DECREF(unicode);
477 }
478 return latin1_char;
479 }
480 else {
481 assert(_PyUnicode_CheckConsistency(unicode, 1));
482 Py_INCREF(unicode);
483 unicode_latin1[ch] = unicode;
484 return unicode;
485 }
486 }
487 }
488
489 assert(_PyUnicode_CheckConsistency(unicode, 1));
490 return unicode;
491}
492
493static PyObject*
494unicode_result(PyObject *unicode)
495{
496 assert(_PyUnicode_CHECK(unicode));
497 if (PyUnicode_IS_READY(unicode))
498 return unicode_result_ready(unicode);
499 else
500 return unicode_result_wchar(unicode);
501}
502
Victor Stinnerc4b49542011-12-11 22:44:26 +0100503static PyObject*
504unicode_result_unchanged(PyObject *unicode)
505{
506 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500507 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508 return NULL;
509 Py_INCREF(unicode);
510 return unicode;
511 }
512 else
513 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100514 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515}
516
#if defined(HAVE_MBCS)
/* Windows version information; only present when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521/* --- Bloom Filters ----------------------------------------------------- */
522
523/* stuff to implement simple "bloom filters" for Unicode characters.
524 to keep things simple, we use a single bitmask, using the least 5
525 bits from each unicode characters as the bit index. */
526
527/* the linebreak mask is set up by Unicode_Init below */
528
Antoine Pitrouf068f942010-01-13 14:19:12 +0000529#if LONG_BIT >= 128
530#define BLOOM_WIDTH 128
531#elif LONG_BIT >= 64
532#define BLOOM_WIDTH 64
533#elif LONG_BIT >= 32
534#define BLOOM_WIDTH 32
535#else
536#error "LONG_BIT is smaller than 32"
537#endif
538
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539#define BLOOM_MASK unsigned long
540
Serhiy Storchaka05997252013-01-26 12:14:02 +0200541static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542
Antoine Pitrouf068f942010-01-13 14:19:12 +0000543#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544
Benjamin Peterson29060642009-01-31 22:14:21 +0000545#define BLOOM_LINEBREAK(ch) \
546 ((ch) < 128U ? ascii_linebreak[(ch)] : \
547 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
Alexander Belopolsky40018472011-02-26 01:02:56 +0000549Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551{
Victor Stinnera85af502013-04-09 21:53:54 +0200552#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
553 do { \
554 TYPE *data = (TYPE *)PTR; \
555 TYPE *end = data + LEN; \
556 Py_UCS4 ch; \
557 for (; data != end; data++) { \
558 ch = *data; \
559 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
560 } \
561 break; \
562 } while (0)
563
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564 /* calculate simple bloom-style bitmask for a given unicode string */
565
Antoine Pitrouf068f942010-01-13 14:19:12 +0000566 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567
568 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200569 switch (kind) {
570 case PyUnicode_1BYTE_KIND:
571 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
572 break;
573 case PyUnicode_2BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
575 break;
576 case PyUnicode_4BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
578 break;
579 default:
580 assert(0);
581 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000582 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200583
584#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585}
586
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200587/* Compilation of templated routines */
588
589#include "stringlib/asciilib.h"
590#include "stringlib/fastsearch.h"
591#include "stringlib/partition.h"
592#include "stringlib/split.h"
593#include "stringlib/count.h"
594#include "stringlib/find.h"
595#include "stringlib/find_max_char.h"
596#include "stringlib/localeutil.h"
597#include "stringlib/undef.h"
598
599#include "stringlib/ucs1lib.h"
600#include "stringlib/fastsearch.h"
601#include "stringlib/partition.h"
602#include "stringlib/split.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300605#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs2lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300616#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200617#include "stringlib/find_max_char.h"
618#include "stringlib/localeutil.h"
619#include "stringlib/undef.h"
620
621#include "stringlib/ucs4lib.h"
622#include "stringlib/fastsearch.h"
623#include "stringlib/partition.h"
624#include "stringlib/split.h"
625#include "stringlib/count.h"
626#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300627#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200628#include "stringlib/find_max_char.h"
629#include "stringlib/localeutil.h"
630#include "stringlib/undef.h"
631
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200632#include "stringlib/unicodedefs.h"
633#include "stringlib/fastsearch.h"
634#include "stringlib/count.h"
635#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100636#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638/* --- Unicode Object ----------------------------------------------------- */
639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200641fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200643Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
644 Py_ssize_t size, Py_UCS4 ch,
645 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
648
649 switch (kind) {
650 case PyUnicode_1BYTE_KIND:
651 {
652 Py_UCS1 ch1 = (Py_UCS1) ch;
653 if (ch1 == ch)
654 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
655 else
656 return -1;
657 }
658 case PyUnicode_2BYTE_KIND:
659 {
660 Py_UCS2 ch2 = (Py_UCS2) ch;
661 if (ch2 == ch)
662 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
663 else
664 return -1;
665 }
666 case PyUnicode_4BYTE_KIND:
667 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
668 default:
669 assert(0);
670 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672}
673
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Every byte of the characters from index old_length up to the current
   length of the string is set to 0xff; nothing is written when the string
   has not grown beyond old_length.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* offsets and sizes are in bytes: characters times character size */
        memset(bytes + old_length * char_size,
               0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
692
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693static PyObject*
694resize_compact(PyObject *unicode, Py_ssize_t length)
695{
696 Py_ssize_t char_size;
697 Py_ssize_t struct_size;
698 Py_ssize_t new_size;
699 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100700 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
703#endif
704
Victor Stinner79891572012-05-03 13:43:07 +0200705 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100707 assert(PyUnicode_IS_COMPACT(unicode));
708
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200709 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100710 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 struct_size = sizeof(PyASCIIObject);
712 else
713 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
717 PyErr_NoMemory();
718 return NULL;
719 }
720 new_size = (struct_size + (length + 1) * char_size);
721
Victor Stinner84def372011-12-11 20:04:56 +0100722 _Py_DEC_REFTOTAL;
723 _Py_ForgetReference(unicode);
724
725 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
726 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100727 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200728 PyErr_NoMemory();
729 return NULL;
730 }
Victor Stinner84def372011-12-11 20:04:56 +0100731 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100733
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100737 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 _PyUnicode_WSTR_LENGTH(unicode) = length;
739 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100740 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
741 PyObject_DEL(_PyUnicode_WSTR(unicode));
742 _PyUnicode_WSTR(unicode) = NULL;
743 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200744#ifdef Py_DEBUG
745 unicode_fill_invalid(unicode, old_length);
746#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
748 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 return unicode;
751}
752
Alexander Belopolsky40018472011-02-26 01:02:56 +0000753static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200754resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755{
Victor Stinner95663112011-10-04 01:03:50 +0200756 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000760
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 if (PyUnicode_IS_READY(unicode)) {
762 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200765#ifdef Py_DEBUG
766 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
767#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200770 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200771 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
772 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
775 PyErr_NoMemory();
776 return -1;
777 }
778 new_size = (length + 1) * char_size;
779
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
781 {
782 PyObject_DEL(_PyUnicode_UTF8(unicode));
783 _PyUnicode_UTF8(unicode) = NULL;
784 _PyUnicode_UTF8_LENGTH(unicode) = 0;
785 }
786
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 data = (PyObject *)PyObject_REALLOC(data, new_size);
788 if (data == NULL) {
789 PyErr_NoMemory();
790 return -1;
791 }
792 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200793 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200795 _PyUnicode_WSTR_LENGTH(unicode) = length;
796 }
797 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200798 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200799 _PyUnicode_UTF8_LENGTH(unicode) = length;
800 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200801 _PyUnicode_LENGTH(unicode) = length;
802 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200803#ifdef Py_DEBUG
804 unicode_fill_invalid(unicode, old_length);
805#endif
Victor Stinner95663112011-10-04 01:03:50 +0200806 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200807 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200809 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200810 }
Victor Stinner95663112011-10-04 01:03:50 +0200811 assert(_PyUnicode_WSTR(unicode) != NULL);
812
813 /* check for integer overflow */
814 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
815 PyErr_NoMemory();
816 return -1;
817 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100818 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200819 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100820 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200821 if (!wstr) {
822 PyErr_NoMemory();
823 return -1;
824 }
825 _PyUnicode_WSTR(unicode) = wstr;
826 _PyUnicode_WSTR(unicode)[length] = 0;
827 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200828 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 return 0;
830}
831
Victor Stinnerfe226c02011-10-03 03:52:20 +0200832static PyObject*
833resize_copy(PyObject *unicode, Py_ssize_t length)
834{
835 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100836 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100838
Benjamin Petersonbac79492012-01-14 13:34:47 -0500839 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100840 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200841
842 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
843 if (copy == NULL)
844 return NULL;
845
846 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200847 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200848 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200849 }
850 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200851 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100852
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200853 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200854 if (w == NULL)
855 return NULL;
856 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
857 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200858 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
859 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200860 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200861 }
862}
863
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000865 Ux0000 terminated; some code (e.g. new_identifier)
866 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867
868 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000869 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870
871*/
872
Alexander Belopolsky40018472011-02-26 01:02:56 +0000873static PyUnicodeObject *
874_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200876 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880 if (length == 0 && unicode_empty != NULL) {
881 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200882 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 }
884
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000885 /* Ensure we won't overflow the size. */
886 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
887 return (PyUnicodeObject *)PyErr_NoMemory();
888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 if (length < 0) {
890 PyErr_SetString(PyExc_SystemError,
891 "Negative size passed to _PyUnicode_New");
892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
896 if (unicode == NULL)
897 return NULL;
898 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100899
900 _PyUnicode_WSTR_LENGTH(unicode) = length;
901 _PyUnicode_HASH(unicode) = -1;
902 _PyUnicode_STATE(unicode).interned = 0;
903 _PyUnicode_STATE(unicode).kind = 0;
904 _PyUnicode_STATE(unicode).compact = 0;
905 _PyUnicode_STATE(unicode).ready = 0;
906 _PyUnicode_STATE(unicode).ascii = 0;
907 _PyUnicode_DATA_ANY(unicode) = NULL;
908 _PyUnicode_LENGTH(unicode) = 0;
909 _PyUnicode_UTF8(unicode) = NULL;
910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
913 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100914 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000915 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100916 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200918
Jeremy Hyltond8082792003-09-16 19:41:39 +0000919 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000920 * the caller fails before initializing str -- unicode_resize()
921 * reads str[0], and the Keep-Alive optimization can keep memory
922 * allocated for str alive across a call to unicode_dealloc(unicode).
923 * We don't want unicode_resize to read uninitialized memory in
924 * that case.
925 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926 _PyUnicode_WSTR(unicode)[0] = 0;
927 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100928
Victor Stinner7931d9a2011-11-04 00:22:48 +0100929 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000930 return unicode;
931}
932
Victor Stinnerf42dc442011-10-02 23:33:16 +0200933static const char*
934unicode_kind_name(PyObject *unicode)
935{
Victor Stinner42dfd712011-10-03 14:41:45 +0200936 /* don't check consistency: unicode_kind_name() is called from
937 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938 if (!PyUnicode_IS_COMPACT(unicode))
939 {
940 if (!PyUnicode_IS_READY(unicode))
941 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600942 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 {
944 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200945 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946 return "legacy ascii";
947 else
948 return "legacy latin1";
949 case PyUnicode_2BYTE_KIND:
950 return "legacy UCS2";
951 case PyUnicode_4BYTE_KIND:
952 return "legacy UCS4";
953 default:
954 return "<legacy invalid kind>";
955 }
956 }
957 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600958 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200959 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200960 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200961 return "ascii";
962 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200967 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200968 default:
969 return "<invalid compact kind>";
970 }
971}
972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);

    return utf8;
}
978
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);

    return data;
}
982void *_PyUnicode_data(void *unicode){
983 printf("obj %p\n", unicode);
984 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
985 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
986 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
987 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
988 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
989 return PyUnicode_DATA(unicode);
990}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200991
992void
993_PyUnicode_Dump(PyObject *op)
994{
995 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200996 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
997 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
998 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200999
Victor Stinnera849a4b2011-10-03 12:12:11 +02001000 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001001 {
1002 if (ascii->state.ascii)
1003 data = (ascii + 1);
1004 else
1005 data = (compact + 1);
1006 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001007 else
1008 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001009 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1010
Victor Stinnera849a4b2011-10-03 12:12:11 +02001011 if (ascii->wstr == data)
1012 printf("shared ");
1013 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001014
Victor Stinnera3b334d2011-10-03 13:53:37 +02001015 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 printf(" (%zu), ", compact->wstr_length);
1017 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1018 printf("shared ");
1019 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001020 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001022}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001023#endif
1024
1025PyObject *
1026PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1027{
1028 PyObject *obj;
1029 PyCompactUnicodeObject *unicode;
1030 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001031 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001032 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035
1036 /* Optimization for empty strings */
1037 if (size == 0 && unicode_empty != NULL) {
1038 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001039 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 }
1041
Victor Stinner9e9d6892011-10-04 01:02:02 +02001042 is_ascii = 0;
1043 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 struct_size = sizeof(PyCompactUnicodeObject);
1045 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001046 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 char_size = 1;
1048 is_ascii = 1;
1049 struct_size = sizeof(PyASCIIObject);
1050 }
1051 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001052 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 char_size = 1;
1054 }
1055 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001056 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 char_size = 2;
1058 if (sizeof(wchar_t) == 2)
1059 is_sharing = 1;
1060 }
1061 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001062 if (maxchar > MAX_UNICODE) {
1063 PyErr_SetString(PyExc_SystemError,
1064 "invalid maximum character passed to PyUnicode_New");
1065 return NULL;
1066 }
Victor Stinner8f825062012-04-27 13:55:39 +02001067 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 char_size = 4;
1069 if (sizeof(wchar_t) == 4)
1070 is_sharing = 1;
1071 }
1072
1073 /* Ensure we won't overflow the size. */
1074 if (size < 0) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "Negative size passed to PyUnicode_New");
1077 return NULL;
1078 }
1079 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1080 return PyErr_NoMemory();
1081
1082 /* Duplicated allocation code from _PyObject_New() instead of a call to
1083 * PyObject_New() so we are able to allocate space for the object and
1084 * it's data buffer.
1085 */
1086 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1087 if (obj == NULL)
1088 return PyErr_NoMemory();
1089 obj = PyObject_INIT(obj, &PyUnicode_Type);
1090 if (obj == NULL)
1091 return NULL;
1092
1093 unicode = (PyCompactUnicodeObject *)obj;
1094 if (is_ascii)
1095 data = ((PyASCIIObject*)obj) + 1;
1096 else
1097 data = unicode + 1;
1098 _PyUnicode_LENGTH(unicode) = size;
1099 _PyUnicode_HASH(unicode) = -1;
1100 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001101 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001102 _PyUnicode_STATE(unicode).compact = 1;
1103 _PyUnicode_STATE(unicode).ready = 1;
1104 _PyUnicode_STATE(unicode).ascii = is_ascii;
1105 if (is_ascii) {
1106 ((char*)data)[size] = 0;
1107 _PyUnicode_WSTR(unicode) = NULL;
1108 }
Victor Stinner8f825062012-04-27 13:55:39 +02001109 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 ((char*)data)[size] = 0;
1111 _PyUnicode_WSTR(unicode) = NULL;
1112 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001114 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 else {
1117 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001118 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001119 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001121 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 ((Py_UCS4*)data)[size] = 0;
1123 if (is_sharing) {
1124 _PyUnicode_WSTR_LENGTH(unicode) = size;
1125 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1126 }
1127 else {
1128 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1129 _PyUnicode_WSTR(unicode) = NULL;
1130 }
1131 }
Victor Stinner8f825062012-04-27 13:55:39 +02001132#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001133 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001134#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001135 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136 return obj;
1137}
1138
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t characters in [begin, end) to UCS4 and store
   them in the 4-byte data of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit, lone surrogates included, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        /* There must still be room for the code point about to be stored. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src+1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (is_pair) {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1178
Victor Stinnercd9950f2011-10-02 00:34:53 +02001179static int
Victor Stinner488fa492011-12-12 00:01:39 +01001180unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001181{
Victor Stinner488fa492011-12-12 00:01:39 +01001182 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001183 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001184 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001185 return -1;
1186 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001187 return 0;
1188}
1189
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001190static int
1191_copy_characters(PyObject *to, Py_ssize_t to_start,
1192 PyObject *from, Py_ssize_t from_start,
1193 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001195 unsigned int from_kind, to_kind;
1196 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001197
Victor Stinneree4544c2012-05-09 22:24:08 +02001198 assert(0 <= how_many);
1199 assert(0 <= from_start);
1200 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001201 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204
Victor Stinnerd3f08822012-05-29 12:57:52 +02001205 assert(PyUnicode_Check(to));
1206 assert(PyUnicode_IS_READY(to));
1207 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1208
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001209 if (how_many == 0)
1210 return 0;
1211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001215 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216
Victor Stinnerf1852262012-06-16 16:38:26 +02001217#ifdef Py_DEBUG
1218 if (!check_maxchar
1219 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1220 {
1221 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1222 Py_UCS4 ch;
1223 Py_ssize_t i;
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 assert(ch <= to_maxchar);
1227 }
1228 }
1229#endif
1230
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001231 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001232 if (check_maxchar
1233 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1234 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 /* Writing Latin-1 characters into an ASCII string requires to
1236 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 Py_UCS4 max_char;
1238 max_char = ucs1lib_find_max_char(from_data,
1239 (Py_UCS1*)from_data + how_many);
1240 if (max_char >= 128)
1241 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001243 Py_MEMCPY((char*)to_data + to_kind * to_start,
1244 (char*)from_data + from_kind * from_start,
1245 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001247 else if (from_kind == PyUnicode_1BYTE_KIND
1248 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001249 {
1250 _PyUnicode_CONVERT_BYTES(
1251 Py_UCS1, Py_UCS2,
1252 PyUnicode_1BYTE_DATA(from) + from_start,
1253 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1254 PyUnicode_2BYTE_DATA(to) + to_start
1255 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001256 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001257 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001258 && to_kind == PyUnicode_4BYTE_KIND)
1259 {
1260 _PyUnicode_CONVERT_BYTES(
1261 Py_UCS1, Py_UCS4,
1262 PyUnicode_1BYTE_DATA(from) + from_start,
1263 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1264 PyUnicode_4BYTE_DATA(to) + to_start
1265 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001266 }
1267 else if (from_kind == PyUnicode_2BYTE_KIND
1268 && to_kind == PyUnicode_4BYTE_KIND)
1269 {
1270 _PyUnicode_CONVERT_BYTES(
1271 Py_UCS2, Py_UCS4,
1272 PyUnicode_2BYTE_DATA(from) + from_start,
1273 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1274 PyUnicode_4BYTE_DATA(to) + to_start
1275 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001276 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001277 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001278 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1279
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001280 if (!check_maxchar) {
1281 if (from_kind == PyUnicode_2BYTE_KIND
1282 && to_kind == PyUnicode_1BYTE_KIND)
1283 {
1284 _PyUnicode_CONVERT_BYTES(
1285 Py_UCS2, Py_UCS1,
1286 PyUnicode_2BYTE_DATA(from) + from_start,
1287 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1288 PyUnicode_1BYTE_DATA(to) + to_start
1289 );
1290 }
1291 else if (from_kind == PyUnicode_4BYTE_KIND
1292 && to_kind == PyUnicode_1BYTE_KIND)
1293 {
1294 _PyUnicode_CONVERT_BYTES(
1295 Py_UCS4, Py_UCS1,
1296 PyUnicode_4BYTE_DATA(from) + from_start,
1297 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1298 PyUnicode_1BYTE_DATA(to) + to_start
1299 );
1300 }
1301 else if (from_kind == PyUnicode_4BYTE_KIND
1302 && to_kind == PyUnicode_2BYTE_KIND)
1303 {
1304 _PyUnicode_CONVERT_BYTES(
1305 Py_UCS4, Py_UCS2,
1306 PyUnicode_4BYTE_DATA(from) + from_start,
1307 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1308 PyUnicode_2BYTE_DATA(to) + to_start
1309 );
1310 }
1311 else {
1312 assert(0);
1313 return -1;
1314 }
1315 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001316 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001317 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001318 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001319 Py_ssize_t i;
1320
Victor Stinnera0702ab2011-09-29 14:14:38 +02001321 for (i=0; i < how_many; i++) {
1322 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001323 if (ch > to_maxchar)
1324 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001325 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1326 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001327 }
1328 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001329 return 0;
1330}
1331
Victor Stinnerd3f08822012-05-29 12:57:52 +02001332void
1333_PyUnicode_FastCopyCharacters(
1334 PyObject *to, Py_ssize_t to_start,
1335 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001336{
1337 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1338}
1339
1340Py_ssize_t
1341PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1342 PyObject *from, Py_ssize_t from_start,
1343 Py_ssize_t how_many)
1344{
1345 int err;
1346
1347 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1348 PyErr_BadInternalCall();
1349 return -1;
1350 }
1351
Benjamin Petersonbac79492012-01-14 13:34:47 -05001352 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001353 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001354 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001355 return -1;
1356
Victor Stinnerd3f08822012-05-29 12:57:52 +02001357 if (from_start < 0) {
1358 PyErr_SetString(PyExc_IndexError, "string index out of range");
1359 return -1;
1360 }
1361 if (to_start < 0) {
1362 PyErr_SetString(PyExc_IndexError, "string index out of range");
1363 return -1;
1364 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1366 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1367 PyErr_Format(PyExc_SystemError,
1368 "Cannot write %zi characters at %zi "
1369 "in a string of %zi characters",
1370 how_many, to_start, PyUnicode_GET_LENGTH(to));
1371 return -1;
1372 }
1373
1374 if (how_many == 0)
1375 return 0;
1376
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001378 return -1;
1379
1380 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1381 if (err) {
1382 PyErr_Format(PyExc_SystemError,
1383 "Cannot copy %s characters "
1384 "into a string of %s characters",
1385 unicode_kind_name(from),
1386 unicode_kind_name(to));
1387 return -1;
1388 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390}
1391
Victor Stinner17222162011-09-28 22:15:37 +02001392/* Find the maximum code point and count the number of surrogate pairs so a
1393 correct string length can be computed before converting a string to UCS4.
1394 This function counts single surrogates as a character and not as a pair.
1395
1396 Return 0 on success, or -1 on error. */
1397static int
1398find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1399 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400{
1401 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001402 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403
Victor Stinnerc53be962011-10-02 21:33:54 +02001404 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 *num_surrogates = 0;
1406 *maxchar = 0;
1407
1408 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001410 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1411 && (iter+1) < end
1412 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1413 {
1414 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1415 ++(*num_surrogates);
1416 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 }
1418 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001420 {
1421 ch = *iter;
1422 iter++;
1423 }
1424 if (ch > *maxchar) {
1425 *maxchar = ch;
1426 if (*maxchar > MAX_UNICODE) {
1427 PyErr_Format(PyExc_ValueError,
1428 "character U+%x is not in range [U+0000; U+10ffff]",
1429 ch);
1430 return -1;
1431 }
1432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 return 0;
1435}
1436
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001437int
1438_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439{
1440 wchar_t *end;
1441 Py_UCS4 maxchar = 0;
1442 Py_ssize_t num_surrogates;
1443#if SIZEOF_WCHAR_T == 2
1444 Py_ssize_t length_wo_surrogates;
1445#endif
1446
Georg Brandl7597add2011-10-05 16:36:47 +02001447 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001448 strings were created using _PyObject_New() and where no canonical
1449 representation (the str field) has been set yet aka strings
1450 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001451 assert(_PyUnicode_CHECK(unicode));
1452 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001454 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001455 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001456 /* Actually, it should neither be interned nor be anything else: */
1457 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001460 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
1464 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001465 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1466 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 PyErr_NoMemory();
1468 return -1;
1469 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001470 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 _PyUnicode_WSTR(unicode), end,
1472 PyUnicode_1BYTE_DATA(unicode));
1473 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1474 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1475 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1476 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001477 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001478 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001479 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 }
1481 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001483 _PyUnicode_UTF8(unicode) = NULL;
1484 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 PyObject_FREE(_PyUnicode_WSTR(unicode));
1487 _PyUnicode_WSTR(unicode) = NULL;
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 }
1490 /* In this case we might have to convert down from 4-byte native
1491 wchar_t to 2-byte unicode. */
1492 else if (maxchar < 65536) {
1493 assert(num_surrogates == 0 &&
1494 "FindMaxCharAndNumSurrogatePairs() messed up");
1495
Victor Stinner506f5922011-09-28 22:34:18 +02001496#if SIZEOF_WCHAR_T == 2
1497 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001498 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001499 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1500 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1501 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001502 _PyUnicode_UTF8(unicode) = NULL;
1503 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001504#else
1505 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001507 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001508 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001509 PyErr_NoMemory();
1510 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 }
Victor Stinner506f5922011-09-28 22:34:18 +02001512 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1513 _PyUnicode_WSTR(unicode), end,
1514 PyUnicode_2BYTE_DATA(unicode));
1515 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1516 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1517 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001518 _PyUnicode_UTF8(unicode) = NULL;
1519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001520 PyObject_FREE(_PyUnicode_WSTR(unicode));
1521 _PyUnicode_WSTR(unicode) = NULL;
1522 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1523#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524 }
1525 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1526 else {
1527#if SIZEOF_WCHAR_T == 2
1528 /* in case the native representation is 2-bytes, we need to allocate a
1529 new normalized 4-byte version. */
1530 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001531 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1532 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 PyErr_NoMemory();
1534 return -1;
1535 }
1536 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1537 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001538 _PyUnicode_UTF8(unicode) = NULL;
1539 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001540 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1541 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001542 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543 PyObject_FREE(_PyUnicode_WSTR(unicode));
1544 _PyUnicode_WSTR(unicode) = NULL;
1545 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1546#else
1547 assert(num_surrogates == 0);
1548
Victor Stinnerc3c74152011-10-02 20:39:55 +02001549 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001551 _PyUnicode_UTF8(unicode) = NULL;
1552 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1554#endif
1555 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1556 }
1557 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001558 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 return 0;
1560}
1561
Alexander Belopolsky40018472011-02-26 01:02:56 +00001562static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001563unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564{
Walter Dörwald16807132007-05-25 13:52:07 +00001565 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001566 case SSTATE_NOT_INTERNED:
1567 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001568
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 case SSTATE_INTERNED_MORTAL:
1570 /* revive dead object temporarily for DelItem */
1571 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001572 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 Py_FatalError(
1574 "deletion of interned string failed");
1575 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001576
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 case SSTATE_INTERNED_IMMORTAL:
1578 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001579
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 default:
1581 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001582 }
1583
Victor Stinner03490912011-10-03 23:45:12 +02001584 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001586 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001587 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001588 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1589 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001591 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592}
1593
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects
   kept by this file (the empty string, or an entry of the unicode_latin1
   cache of one-character strings), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical kind can be cached. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1610
Alexander Belopolsky40018472011-02-26 01:02:56 +00001611static int
Victor Stinner488fa492011-12-12 00:01:39 +01001612unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001613{
Victor Stinner488fa492011-12-12 00:01:39 +01001614 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 if (Py_REFCNT(unicode) != 1)
1616 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001617 if (_PyUnicode_HASH(unicode) != -1)
1618 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001619 if (PyUnicode_CHECK_INTERNED(unicode))
1620 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001621 if (!PyUnicode_CheckExact(unicode))
1622 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001623#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001624 /* singleton refcount is greater than 1 */
1625 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001626#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001627 return 1;
1628}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629
Victor Stinnerfe226c02011-10-03 03:52:20 +02001630static int
1631unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1632{
1633 PyObject *unicode;
1634 Py_ssize_t old_length;
1635
1636 assert(p_unicode != NULL);
1637 unicode = *p_unicode;
1638
1639 assert(unicode != NULL);
1640 assert(PyUnicode_Check(unicode));
1641 assert(0 <= length);
1642
Victor Stinner910337b2011-10-03 03:20:16 +02001643 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001644 old_length = PyUnicode_WSTR_LENGTH(unicode);
1645 else
1646 old_length = PyUnicode_GET_LENGTH(unicode);
1647 if (old_length == length)
1648 return 0;
1649
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001650 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001651 _Py_INCREF_UNICODE_EMPTY();
1652 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001653 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 Py_DECREF(*p_unicode);
1655 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001656 return 0;
1657 }
1658
Victor Stinner488fa492011-12-12 00:01:39 +01001659 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001660 PyObject *copy = resize_copy(unicode, length);
1661 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001663 Py_DECREF(*p_unicode);
1664 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001665 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666 }
1667
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001669 PyObject *new_unicode = resize_compact(unicode, length);
1670 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001671 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001672 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001674 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001675 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001676}
1677
Alexander Belopolsky40018472011-02-26 01:02:56 +00001678int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001679PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001680{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001681 PyObject *unicode;
1682 if (p_unicode == NULL) {
1683 PyErr_BadInternalCall();
1684 return -1;
1685 }
1686 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001687 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688 {
1689 PyErr_BadInternalCall();
1690 return -1;
1691 }
1692 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001693}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001694
Victor Stinnerc5166102012-02-22 13:55:02 +01001695/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001696
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001697 WARNING: The function doesn't copy the terminating null character and
1698 doesn't check the maximum character (may write a latin1 character in an
1699 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001700static void
1701unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1702 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001703{
1704 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1705 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001706 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001707
1708 switch (kind) {
1709 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001710 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001711#ifdef Py_DEBUG
1712 if (PyUnicode_IS_ASCII(unicode)) {
1713 Py_UCS4 maxchar = ucs1lib_find_max_char(
1714 (const Py_UCS1*)str,
1715 (const Py_UCS1*)str + len);
1716 assert(maxchar < 128);
1717 }
1718#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001719 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001720 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001721 }
1722 case PyUnicode_2BYTE_KIND: {
1723 Py_UCS2 *start = (Py_UCS2 *)data + index;
1724 Py_UCS2 *ucs2 = start;
1725 assert(index <= PyUnicode_GET_LENGTH(unicode));
1726
Victor Stinner184252a2012-06-16 02:57:41 +02001727 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 *ucs2 = (Py_UCS2)*str;
1729
1730 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001731 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 }
1733 default: {
1734 Py_UCS4 *start = (Py_UCS4 *)data + index;
1735 Py_UCS4 *ucs4 = start;
1736 assert(kind == PyUnicode_4BYTE_KIND);
1737 assert(index <= PyUnicode_GET_LENGTH(unicode));
1738
Victor Stinner184252a2012-06-16 02:57:41 +02001739 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001740 *ucs4 = (Py_UCS4)*str;
1741
1742 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001743 }
1744 }
1745}
1746
1747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748static PyObject*
1749get_latin1_char(unsigned char ch)
1750{
Victor Stinnera464fc12011-10-02 20:39:30 +02001751 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001753 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 if (!unicode)
1755 return NULL;
1756 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001757 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 unicode_latin1[ch] = unicode;
1759 }
1760 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001761 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Alexander Belopolsky40018472011-02-26 01:02:56 +00001764PyObject *
1765PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001767 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 Py_UCS4 maxchar = 0;
1769 Py_ssize_t num_surrogates;
1770
1771 if (u == NULL)
1772 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001774 /* If the Unicode data is known at construction time, we can apply
1775 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001778 if (size == 0)
1779 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 /* Single character Unicode objects in the Latin-1 range are
1782 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001783 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 return get_latin1_char((unsigned char)*u);
1785
1786 /* If not empty and not single character, copy the Unicode data
1787 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001788 if (find_maxchar_surrogates(u, u + size,
1789 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 return NULL;
1791
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 if (!unicode)
1794 return NULL;
1795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 switch (PyUnicode_KIND(unicode)) {
1797 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001798 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1800 break;
1801 case PyUnicode_2BYTE_KIND:
1802#if Py_UNICODE_SIZE == 2
1803 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1804#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001805 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1807#endif
1808 break;
1809 case PyUnicode_4BYTE_KIND:
1810#if SIZEOF_WCHAR_T == 2
1811 /* This is the only case which has to process surrogates, thus
1812 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001813 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814#else
1815 assert(num_surrogates == 0);
1816 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1817#endif
1818 break;
1819 default:
1820 assert(0 && "Impossible state");
1821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001823 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824}
1825
Alexander Belopolsky40018472011-02-26 01:02:56 +00001826PyObject *
1827PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001828{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001829 if (size < 0) {
1830 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001831 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 return NULL;
1833 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001834 if (u != NULL)
1835 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1836 else
1837 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001838}
1839
Alexander Belopolsky40018472011-02-26 01:02:56 +00001840PyObject *
1841PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001842{
1843 size_t size = strlen(u);
1844 if (size > PY_SSIZE_T_MAX) {
1845 PyErr_SetString(PyExc_OverflowError, "input too long");
1846 return NULL;
1847 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001848 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001849}
1850
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001851PyObject *
1852_PyUnicode_FromId(_Py_Identifier *id)
1853{
1854 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001855 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1856 strlen(id->string),
1857 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001858 if (!id->object)
1859 return NULL;
1860 PyUnicode_InternInPlace(&id->object);
1861 assert(!id->next);
1862 id->next = static_strings;
1863 static_strings = id;
1864 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001865 return id->object;
1866}
1867
1868void
1869_PyUnicode_ClearStaticStrings()
1870{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001871 _Py_Identifier *tmp, *s = static_strings;
1872 while (s) {
1873 Py_DECREF(s->object);
1874 s->object = NULL;
1875 tmp = s->next;
1876 s->next = NULL;
1877 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001878 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001879 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001880}
1881
Benjamin Peterson0df54292012-03-26 14:50:32 -04001882/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001883
Victor Stinnerd3f08822012-05-29 12:57:52 +02001884PyObject*
1885_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001886{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001887 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001888 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001889 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001890#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001891 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001892#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001893 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001894 }
Victor Stinner785938e2011-12-11 20:09:03 +01001895 unicode = PyUnicode_New(size, 127);
1896 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001897 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001898 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1899 assert(_PyUnicode_CheckConsistency(unicode, 1));
1900 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001901}
1902
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001903static Py_UCS4
1904kind_maxchar_limit(unsigned int kind)
1905{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001906 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907 case PyUnicode_1BYTE_KIND:
1908 return 0x80;
1909 case PyUnicode_2BYTE_KIND:
1910 return 0x100;
1911 case PyUnicode_4BYTE_KIND:
1912 return 0x10000;
1913 default:
1914 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001915 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001916 }
1917}
1918
Victor Stinnere6abb482012-05-02 01:15:40 +02001919Py_LOCAL_INLINE(Py_UCS4)
1920align_maxchar(Py_UCS4 maxchar)
1921{
1922 if (maxchar <= 127)
1923 return 127;
1924 else if (maxchar <= 255)
1925 return 255;
1926 else if (maxchar <= 65535)
1927 return 65535;
1928 else
1929 return MAX_UNICODE;
1930}
1931
Victor Stinner702c7342011-10-05 13:50:52 +02001932static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001933_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001937
Serhiy Storchaka678db842013-01-26 12:16:36 +02001938 if (size == 0)
1939 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001940 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001941 if (size == 1)
1942 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001943
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001945 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 if (!res)
1947 return NULL;
1948 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001949 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001951}
1952
Victor Stinnere57b1c02011-09-28 22:20:48 +02001953static PyObject*
1954_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955{
1956 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001957 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001958
Serhiy Storchaka678db842013-01-26 12:16:36 +02001959 if (size == 0)
1960 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001961 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001962 if (size == 1) {
1963 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001964 int kind;
1965 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001966 if (ch < 256)
1967 return get_latin1_char((unsigned char)ch);
1968
1969 res = PyUnicode_New(1, ch);
1970 if (res == NULL)
1971 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001972 kind = PyUnicode_KIND(res);
1973 data = PyUnicode_DATA(res);
1974 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001975 assert(_PyUnicode_CheckConsistency(res, 1));
1976 return res;
1977 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001979 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 if (!res)
1982 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001983 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001985 else {
1986 _PyUnicode_CONVERT_BYTES(
1987 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1988 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001989 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 return res;
1991}
1992
Victor Stinnere57b1c02011-09-28 22:20:48 +02001993static PyObject*
1994_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995{
1996 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998
Serhiy Storchaka678db842013-01-26 12:16:36 +02001999 if (size == 0)
2000 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002002 if (size == 1) {
2003 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002004 int kind;
2005 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002006 if (ch < 256)
2007 return get_latin1_char((unsigned char)ch);
2008
2009 res = PyUnicode_New(1, ch);
2010 if (res == NULL)
2011 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002012 kind = PyUnicode_KIND(res);
2013 data = PyUnicode_DATA(res);
2014 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002015 assert(_PyUnicode_CheckConsistency(res, 1));
2016 return res;
2017 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002019 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002020 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 if (!res)
2022 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002023 if (max_char < 256)
2024 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2025 PyUnicode_1BYTE_DATA(res));
2026 else if (max_char < 0x10000)
2027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2028 PyUnicode_2BYTE_DATA(res));
2029 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002031 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 return res;
2033}
2034
2035PyObject*
2036PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2037{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002038 if (size < 0) {
2039 PyErr_SetString(PyExc_ValueError, "size must be positive");
2040 return NULL;
2041 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002042 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002044 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002046 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002048 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002049 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002050 PyErr_SetString(PyExc_SystemError, "invalid kind");
2051 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053}
2054
Victor Stinnerece58de2012-04-23 23:36:38 +02002055Py_UCS4
2056_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2057{
2058 enum PyUnicode_Kind kind;
2059 void *startptr, *endptr;
2060
2061 assert(PyUnicode_IS_READY(unicode));
2062 assert(0 <= start);
2063 assert(end <= PyUnicode_GET_LENGTH(unicode));
2064 assert(start <= end);
2065
2066 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2067 return PyUnicode_MAX_CHAR_VALUE(unicode);
2068
2069 if (start == end)
2070 return 127;
2071
Victor Stinner94d558b2012-04-27 22:26:58 +02002072 if (PyUnicode_IS_ASCII(unicode))
2073 return 127;
2074
Victor Stinnerece58de2012-04-23 23:36:38 +02002075 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002076 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002077 endptr = (char *)startptr + end * kind;
2078 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002079 switch(kind) {
2080 case PyUnicode_1BYTE_KIND:
2081 return ucs1lib_find_max_char(startptr, endptr);
2082 case PyUnicode_2BYTE_KIND:
2083 return ucs2lib_find_max_char(startptr, endptr);
2084 case PyUnicode_4BYTE_KIND:
2085 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002086 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002087 assert(0);
2088 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 }
2090}
2091
Victor Stinner25a4b292011-10-06 12:31:55 +02002092/* Ensure that a string uses the most efficient storage, if it is not the
2093 case: create a new string with of the right kind. Write NULL into *p_unicode
2094 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002095static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002096unicode_adjust_maxchar(PyObject **p_unicode)
2097{
2098 PyObject *unicode, *copy;
2099 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002100 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002101 unsigned int kind;
2102
2103 assert(p_unicode != NULL);
2104 unicode = *p_unicode;
2105 assert(PyUnicode_IS_READY(unicode));
2106 if (PyUnicode_IS_ASCII(unicode))
2107 return;
2108
2109 len = PyUnicode_GET_LENGTH(unicode);
2110 kind = PyUnicode_KIND(unicode);
2111 if (kind == PyUnicode_1BYTE_KIND) {
2112 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002113 max_char = ucs1lib_find_max_char(u, u + len);
2114 if (max_char >= 128)
2115 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002116 }
2117 else if (kind == PyUnicode_2BYTE_KIND) {
2118 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002119 max_char = ucs2lib_find_max_char(u, u + len);
2120 if (max_char >= 256)
2121 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 }
2123 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002124 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002125 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002126 max_char = ucs4lib_find_max_char(u, u + len);
2127 if (max_char >= 0x10000)
2128 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002131 if (copy != NULL)
2132 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 Py_DECREF(unicode);
2134 *p_unicode = copy;
2135}
2136
Victor Stinner034f6cf2011-09-30 02:26:44 +02002137PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002138_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002139{
Victor Stinner87af4f22011-11-21 23:03:47 +01002140 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002141 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002142
Victor Stinner034f6cf2011-09-30 02:26:44 +02002143 if (!PyUnicode_Check(unicode)) {
2144 PyErr_BadInternalCall();
2145 return NULL;
2146 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002147 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002148 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149
Victor Stinner87af4f22011-11-21 23:03:47 +01002150 length = PyUnicode_GET_LENGTH(unicode);
2151 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152 if (!copy)
2153 return NULL;
2154 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2155
Victor Stinner87af4f22011-11-21 23:03:47 +01002156 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2157 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002158 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002159 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002160}
2161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163/* Widen Unicode objects to larger buffers. Don't write terminating null
2164 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165
2166void*
2167_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2168{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002169 Py_ssize_t len;
2170 void *result;
2171 unsigned int skind;
2172
Benjamin Petersonbac79492012-01-14 13:34:47 -05002173 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 return NULL;
2175
2176 len = PyUnicode_GET_LENGTH(s);
2177 skind = PyUnicode_KIND(s);
2178 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002179 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return NULL;
2181 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002182 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 case PyUnicode_2BYTE_KIND:
2184 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2185 if (!result)
2186 return PyErr_NoMemory();
2187 assert(skind == PyUnicode_1BYTE_KIND);
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS1, Py_UCS2,
2190 PyUnicode_1BYTE_DATA(s),
2191 PyUnicode_1BYTE_DATA(s) + len,
2192 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002194 case PyUnicode_4BYTE_KIND:
2195 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2196 if (!result)
2197 return PyErr_NoMemory();
2198 if (skind == PyUnicode_2BYTE_KIND) {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS4,
2201 PyUnicode_2BYTE_DATA(s),
2202 PyUnicode_2BYTE_DATA(s) + len,
2203 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002205 else {
2206 assert(skind == PyUnicode_1BYTE_KIND);
2207 _PyUnicode_CONVERT_BYTES(
2208 Py_UCS1, Py_UCS4,
2209 PyUnicode_1BYTE_DATA(s),
2210 PyUnicode_1BYTE_DATA(s) + len,
2211 result);
2212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002214 default:
2215 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 }
Victor Stinner01698042011-10-04 00:04:26 +02002217 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return NULL;
2219}
2220
2221static Py_UCS4*
2222as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2223 int copy_null)
2224{
2225 int kind;
2226 void *data;
2227 Py_ssize_t len, targetlen;
2228 if (PyUnicode_READY(string) == -1)
2229 return NULL;
2230 kind = PyUnicode_KIND(string);
2231 data = PyUnicode_DATA(string);
2232 len = PyUnicode_GET_LENGTH(string);
2233 targetlen = len;
2234 if (copy_null)
2235 targetlen++;
2236 if (!target) {
2237 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2238 PyErr_NoMemory();
2239 return NULL;
2240 }
2241 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2242 if (!target) {
2243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 }
2247 else {
2248 if (targetsize < targetlen) {
2249 PyErr_Format(PyExc_SystemError,
2250 "string is longer than the buffer");
2251 if (copy_null && 0 < targetsize)
2252 target[0] = 0;
2253 return NULL;
2254 }
2255 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 if (kind == PyUnicode_1BYTE_KIND) {
2257 Py_UCS1 *start = (Py_UCS1 *) data;
2258 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 else if (kind == PyUnicode_2BYTE_KIND) {
2261 Py_UCS2 *start = (Py_UCS2 *) data;
2262 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2263 }
2264 else {
2265 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 if (copy_null)
2269 target[len] = 0;
2270 return target;
2271}
2272
2273Py_UCS4*
2274PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2275 int copy_null)
2276{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002277 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 PyErr_BadInternalCall();
2279 return NULL;
2280 }
2281 return as_ucs4(string, target, targetsize, copy_null);
2282}
2283
2284Py_UCS4*
2285PyUnicode_AsUCS4Copy(PyObject *string)
2286{
2287 return as_ucs4(string, NULL, 0, 1);
2288}
2289
2290#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002291
Alexander Belopolsky40018472011-02-26 01:02:56 +00002292PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002293PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002297 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 PyErr_BadInternalCall();
2299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 }
2301
Martin v. Löwis790465f2008-04-05 20:41:37 +00002302 if (size == -1) {
2303 size = wcslen(w);
2304 }
2305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307}
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002310
Walter Dörwald346737f2007-05-31 10:44:43 +00002311static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002312makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002313 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002314{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002315 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002316 if (longflag)
2317 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002318 else if (longlongflag) {
2319 /* longlongflag should only ever be nonzero on machines with
2320 HAVE_LONG_LONG defined */
2321#ifdef HAVE_LONG_LONG
2322 char *f = PY_FORMAT_LONG_LONG;
2323 while (*f)
2324 *fmt++ = *f++;
2325#else
2326 /* we shouldn't ever get here */
2327 assert(0);
2328 *fmt++ = 'l';
2329#endif
2330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 else if (size_tflag) {
2332 char *f = PY_FORMAT_SIZE_T;
2333 while (*f)
2334 *fmt++ = *f++;
2335 }
2336 *fmt++ = c;
2337 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002338}
2339
/* Size of the scratch buffers used for the output of %lld or %p.
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits, plus 1 character for the sign.  53/22 is used as an upper bound
   for log10(256). */
#define MAX_LONG_LONG_CHARS ((SIZEOF_LONG_LONG * 53 - 1) / 22 + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002344
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002345static int
2346unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2347 Py_ssize_t width, Py_ssize_t precision)
2348{
2349 Py_ssize_t length, fill, arglen;
2350 Py_UCS4 maxchar;
2351
2352 if (PyUnicode_READY(str) == -1)
2353 return -1;
2354
2355 length = PyUnicode_GET_LENGTH(str);
2356 if ((precision == -1 || precision >= length)
2357 && width <= length)
2358 return _PyUnicodeWriter_WriteStr(writer, str);
2359
2360 if (precision != -1)
2361 length = Py_MIN(precision, length);
2362
2363 arglen = Py_MAX(length, width);
2364 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2365 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2366 else
2367 maxchar = writer->maxchar;
2368
2369 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2370 return -1;
2371
2372 if (width > length) {
2373 fill = width - length;
2374 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2375 return -1;
2376 writer->pos += fill;
2377 }
2378
2379 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2380 str, 0, length);
2381 writer->pos += length;
2382 return 0;
2383}
2384
2385static int
2386unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2387 Py_ssize_t width, Py_ssize_t precision)
2388{
2389 /* UTF-8 */
2390 Py_ssize_t length;
2391 PyObject *unicode;
2392 int res;
2393
2394 length = strlen(str);
2395 if (precision != -1)
2396 length = Py_MIN(length, precision);
2397 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2398 if (unicode == NULL)
2399 return -1;
2400
2401 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2402 Py_DECREF(unicode);
2403 return res;
2404}
2405
Victor Stinner96865452011-03-01 23:44:09 +00002406static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002407unicode_fromformat_arg(_PyUnicodeWriter *writer,
2408 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002409{
Victor Stinnere215d962012-10-06 23:03:36 +02002410 const char *p;
2411 Py_ssize_t len;
2412 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 Py_ssize_t width;
2414 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002415 int longflag;
2416 int longlongflag;
2417 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002419
2420 p = f;
2421 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002422 zeropad = 0;
2423 if (*f == '0') {
2424 zeropad = 1;
2425 f++;
2426 }
Victor Stinner96865452011-03-01 23:44:09 +00002427
2428 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002429 width = -1;
2430 if (Py_ISDIGIT((unsigned)*f)) {
2431 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002432 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002433 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002434 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002435 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002436 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002437 return NULL;
2438 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002440 f++;
2441 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002442 }
2443 precision = -1;
2444 if (*f == '.') {
2445 f++;
2446 if (Py_ISDIGIT((unsigned)*f)) {
2447 precision = (*f - '0');
2448 f++;
2449 while (Py_ISDIGIT((unsigned)*f)) {
2450 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2451 PyErr_SetString(PyExc_ValueError,
2452 "precision too big");
2453 return NULL;
2454 }
2455 precision = (precision * 10) + (*f - '0');
2456 f++;
2457 }
2458 }
Victor Stinner96865452011-03-01 23:44:09 +00002459 if (*f == '%') {
2460 /* "%.3%s" => f points to "3" */
2461 f--;
2462 }
2463 }
2464 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002465 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002466 f--;
2467 }
Victor Stinner96865452011-03-01 23:44:09 +00002468
2469 /* Handle %ld, %lu, %lld and %llu. */
2470 longflag = 0;
2471 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002472 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002473 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002474 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002475 longflag = 1;
2476 ++f;
2477 }
2478#ifdef HAVE_LONG_LONG
2479 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002480 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002481 longlongflag = 1;
2482 f += 2;
2483 }
2484#endif
2485 }
2486 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002487 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002488 size_tflag = 1;
2489 ++f;
2490 }
Victor Stinnere215d962012-10-06 23:03:36 +02002491
2492 if (f[1] == '\0')
2493 writer->overallocate = 0;
2494
2495 switch (*f) {
2496 case 'c':
2497 {
2498 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002499 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002500 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002501 "character argument not in range(0x110000)");
2502 return NULL;
2503 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002504 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002505 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002506 break;
2507 }
2508
2509 case 'i':
2510 case 'd':
2511 case 'u':
2512 case 'x':
2513 {
2514 /* used by sprintf */
2515 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002516 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002517 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002518
2519 if (*f == 'u') {
2520 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2521
2522 if (longflag)
2523 len = sprintf(buffer, fmt,
2524 va_arg(*vargs, unsigned long));
2525#ifdef HAVE_LONG_LONG
2526 else if (longlongflag)
2527 len = sprintf(buffer, fmt,
2528 va_arg(*vargs, unsigned PY_LONG_LONG));
2529#endif
2530 else if (size_tflag)
2531 len = sprintf(buffer, fmt,
2532 va_arg(*vargs, size_t));
2533 else
2534 len = sprintf(buffer, fmt,
2535 va_arg(*vargs, unsigned int));
2536 }
2537 else if (*f == 'x') {
2538 makefmt(fmt, 0, 0, 0, 'x');
2539 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2540 }
2541 else {
2542 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2543
2544 if (longflag)
2545 len = sprintf(buffer, fmt,
2546 va_arg(*vargs, long));
2547#ifdef HAVE_LONG_LONG
2548 else if (longlongflag)
2549 len = sprintf(buffer, fmt,
2550 va_arg(*vargs, PY_LONG_LONG));
2551#endif
2552 else if (size_tflag)
2553 len = sprintf(buffer, fmt,
2554 va_arg(*vargs, Py_ssize_t));
2555 else
2556 len = sprintf(buffer, fmt,
2557 va_arg(*vargs, int));
2558 }
2559 assert(len >= 0);
2560
Victor Stinnere215d962012-10-06 23:03:36 +02002561 if (precision < len)
2562 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002563
2564 arglen = Py_MAX(precision, width);
2565 assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127);
2566 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2567 return NULL;
2568
Victor Stinnere215d962012-10-06 23:03:36 +02002569 if (width > precision) {
2570 Py_UCS4 fillchar;
2571 fill = width - precision;
2572 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002573 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2574 return NULL;
2575 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002576 }
Victor Stinner15a11362012-10-06 23:48:20 +02002577 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002578 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002579 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2580 return NULL;
2581 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002582 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583
2584 unicode_write_cstr(writer->buffer, writer->pos, buffer, len);
2585 writer->pos += len;
Victor Stinnere215d962012-10-06 23:03:36 +02002586 break;
2587 }
2588
2589 case 'p':
2590 {
2591 char number[MAX_LONG_LONG_CHARS];
2592
2593 len = sprintf(number, "%p", va_arg(*vargs, void*));
2594 assert(len >= 0);
2595
2596 /* %p is ill-defined: ensure leading 0x. */
2597 if (number[1] == 'X')
2598 number[1] = 'x';
2599 else if (number[1] != 'x') {
2600 memmove(number + 2, number,
2601 strlen(number) + 1);
2602 number[0] = '0';
2603 number[1] = 'x';
2604 len += 2;
2605 }
2606
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002607 assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127);
2608 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002609 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002610 unicode_write_cstr(writer->buffer, writer->pos, number, len);
2611 writer->pos += len;
Victor Stinnere215d962012-10-06 23:03:36 +02002612 break;
2613 }
2614
2615 case 's':
2616 {
2617 /* UTF-8 */
2618 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002620 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002621 break;
2622 }
2623
2624 case 'U':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 assert(obj && _PyUnicode_CHECK(obj));
2628
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002630 return NULL;
2631 break;
2632 }
2633
2634 case 'V':
2635 {
2636 PyObject *obj = va_arg(*vargs, PyObject *);
2637 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002638 if (obj) {
2639 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002640 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002641 return NULL;
2642 }
2643 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 assert(str != NULL);
2645 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002646 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002647 }
2648 break;
2649 }
2650
2651 case 'S':
2652 {
2653 PyObject *obj = va_arg(*vargs, PyObject *);
2654 PyObject *str;
2655 assert(obj);
2656 str = PyObject_Str(obj);
2657 if (!str)
2658 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002659 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002660 Py_DECREF(str);
2661 return NULL;
2662 }
2663 Py_DECREF(str);
2664 break;
2665 }
2666
2667 case 'R':
2668 {
2669 PyObject *obj = va_arg(*vargs, PyObject *);
2670 PyObject *repr;
2671 assert(obj);
2672 repr = PyObject_Repr(obj);
2673 if (!repr)
2674 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002676 Py_DECREF(repr);
2677 return NULL;
2678 }
2679 Py_DECREF(repr);
2680 break;
2681 }
2682
2683 case 'A':
2684 {
2685 PyObject *obj = va_arg(*vargs, PyObject *);
2686 PyObject *ascii;
2687 assert(obj);
2688 ascii = PyObject_ASCII(obj);
2689 if (!ascii)
2690 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002692 Py_DECREF(ascii);
2693 return NULL;
2694 }
2695 Py_DECREF(ascii);
2696 break;
2697 }
2698
2699 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002700 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002701 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002702 break;
2703
2704 default:
2705 /* if we stumble upon an unknown formatting code, copy the rest
2706 of the format string to the output string. (we cannot just
2707 skip the code, since there's no way to know what's in the
2708 argument list) */
2709 len = strlen(p);
2710 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2711 return NULL;
2712 f = p+len;
2713 return f;
2714 }
2715
2716 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002717 return f;
2718}
2719
Walter Dörwaldd2034312007-05-18 16:29:38 +00002720PyObject *
2721PyUnicode_FromFormatV(const char *format, va_list vargs)
2722{
Victor Stinnere215d962012-10-06 23:03:36 +02002723 va_list vargs2;
2724 const char *f;
2725 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002726
Victor Stinner8f674cc2013-04-17 23:02:17 +02002727 _PyUnicodeWriter_Init(&writer);
2728 writer.min_length = strlen(format) + 100;
2729 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002730
2731 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2732 Copy it to be able to pass a reference to a subfunction. */
2733 Py_VA_COPY(vargs2, vargs);
2734
2735 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002737 f = unicode_fromformat_arg(&writer, f, &vargs2);
2738 if (f == NULL)
2739 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 const char *p;
2743 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002744
Victor Stinnere215d962012-10-06 23:03:36 +02002745 p = f;
2746 do
2747 {
2748 if ((unsigned char)*p > 127) {
2749 PyErr_Format(PyExc_ValueError,
2750 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2751 "string, got a non-ASCII byte: 0x%02x",
2752 (unsigned char)*p);
2753 return NULL;
2754 }
2755 p++;
2756 }
2757 while (*p != '\0' && *p != '%');
2758 len = p - f;
2759
2760 if (*p == '\0')
2761 writer.overallocate = 0;
2762 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2763 goto fail;
2764 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2765 writer.pos += len;
2766
2767 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 }
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return _PyUnicodeWriter_Finish(&writer);
2771
2772 fail:
2773 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775}
2776
Walter Dörwaldd2034312007-05-18 16:29:38 +00002777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002780 PyObject* ret;
2781 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782
2783#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 ret = PyUnicode_FromFormatV(format, vargs);
2789 va_end(vargs);
2790 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002791}
2792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793#ifdef HAVE_WCHAR_H
2794
Victor Stinner5593d8a2010-10-02 11:11:27 +00002795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796 convert a Unicode object to a wide character string.
2797
Victor Stinnerd88d9832011-09-06 02:00:05 +02002798 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002799 character) required to convert the unicode object. Ignore size argument.
2800
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002803 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002804static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002806 wchar_t *w,
2807 Py_ssize_t size)
2808{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 const wchar_t *wstr;
2811
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (wstr == NULL)
2814 return -1;
2815
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002817 if (size > res)
2818 size = res + 1;
2819 else
2820 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002822 return res;
2823 }
2824 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002826}
2827
2828Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002829PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 wchar_t *w,
2831 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832{
2833 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 PyErr_BadInternalCall();
2835 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002837 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838}
2839
Victor Stinner137c34c2010-09-29 10:25:54 +00002840wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002841PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002842 Py_ssize_t *size)
2843{
2844 wchar_t* buffer;
2845 Py_ssize_t buflen;
2846
2847 if (unicode == NULL) {
2848 PyErr_BadInternalCall();
2849 return NULL;
2850 }
2851
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002852 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002853 if (buflen == -1)
2854 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002855 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002856 PyErr_NoMemory();
2857 return NULL;
2858 }
2859
Victor Stinner137c34c2010-09-29 10:25:54 +00002860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2861 if (buffer == NULL) {
2862 PyErr_NoMemory();
2863 return NULL;
2864 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002865 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002866 if (buflen == -1) {
2867 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002869 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 if (size != NULL)
2871 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002872 return buffer;
2873}
2874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002875#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Alexander Belopolsky40018472011-02-26 01:02:56 +00002877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002881 void *data;
2882 int kind;
2883
Victor Stinner8faf8212011-12-08 22:14:11 +01002884 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 PyErr_SetString(PyExc_ValueError,
2886 "chr() arg not in range(0x110000)");
2887 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002888 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002889
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002890 if ((Py_UCS4)ordinal < 256)
2891 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 v = PyUnicode_New(1, ordinal);
2894 if (v == NULL)
2895 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002896 kind = PyUnicode_KIND(v);
2897 data = PyUnicode_DATA(v);
2898 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002899 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002901}
2902
Alexander Belopolsky40018472011-02-26 01:02:56 +00002903PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002904PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002906 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002908 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002909 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002910 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 Py_INCREF(obj);
2912 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002913 }
2914 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 /* For a Unicode subtype that's not a Unicode object,
2916 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002917 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002918 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002919 PyErr_Format(PyExc_TypeError,
2920 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002921 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002922 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002923}
2924
Alexander Belopolsky40018472011-02-26 01:02:56 +00002925PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002926PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002927 const char *encoding,
2928 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002929{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002930 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002931 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002932
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 PyErr_BadInternalCall();
2935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Decoding bytes objects is the most common case and should be fast */
2939 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002940 if (PyBytes_GET_SIZE(obj) == 0)
2941 _Py_RETURN_UNICODE_EMPTY();
2942 v = PyUnicode_Decode(
2943 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2944 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002945 return v;
2946 }
2947
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002948 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 PyErr_SetString(PyExc_TypeError,
2950 "decoding str is not supported");
2951 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002953
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002954 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2955 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2956 PyErr_Format(PyExc_TypeError,
2957 "coercing to str: need bytes, bytearray "
2958 "or buffer-like object, %.80s found",
2959 Py_TYPE(obj)->tp_name);
2960 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002961 }
Tim Petersced69f82003-09-16 20:30:58 +00002962
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002963 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002964 PyBuffer_Release(&buffer);
2965 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002967
Serhiy Storchaka05997252013-01-26 12:14:02 +02002968 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002969 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002970 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971}
2972
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the null-terminated result and must provide room for
   lower_len bytes.  Return 1 on success, 0 on error (the normalized name
   plus its terminating null byte does not fit into lower_len bytes; the
   content of lower is then unspecified). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (lower_len == 0) {
        /* Not even room for the terminator; bailing out here also keeps
           the l_end computation below inside the buffer. */
        return 0;
    }

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1: never write past a shorter buffer. */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3009
Alexander Belopolsky40018472011-02-26 01:02:56 +00003010PyObject *
3011PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003012 Py_ssize_t size,
3013 const char *encoding,
3014 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003015{
3016 PyObject *buffer = NULL, *unicode;
3017 Py_buffer info;
3018 char lower[11]; /* Enough for any encoding shortcut */
3019
Fred Drakee4315f52000-05-09 19:53:39 +00003020 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003021 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003022 if ((strcmp(lower, "utf-8") == 0) ||
3023 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003024 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003025 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003026 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003027 (strcmp(lower, "iso-8859-1") == 0) ||
3028 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003029 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003030#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003031 else if (strcmp(lower, "mbcs") == 0)
3032 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003033#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003034 else if (strcmp(lower, "ascii") == 0)
3035 return PyUnicode_DecodeASCII(s, size, errors);
3036 else if (strcmp(lower, "utf-16") == 0)
3037 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3038 else if (strcmp(lower, "utf-32") == 0)
3039 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041
3042 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003043 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003044 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003045 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003046 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 if (buffer == NULL)
3048 goto onError;
3049 unicode = PyCodec_Decode(buffer, encoding, errors);
3050 if (unicode == NULL)
3051 goto onError;
3052 if (!PyUnicode_Check(unicode)) {
3053 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003054 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003055 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 Py_DECREF(unicode);
3057 goto onError;
3058 }
3059 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003060 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003061
Benjamin Peterson29060642009-01-31 22:14:21 +00003062 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 Py_XDECREF(buffer);
3064 return NULL;
3065}
3066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
3068PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 const char *encoding,
3070 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003071{
3072 PyObject *v;
3073
3074 if (!PyUnicode_Check(unicode)) {
3075 PyErr_BadArgument();
3076 goto onError;
3077 }
3078
3079 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003081
3082 /* Decode via the codec registry */
3083 v = PyCodec_Decode(unicode, encoding, errors);
3084 if (v == NULL)
3085 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003086 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003087
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003089 return NULL;
3090}
3091
Alexander Belopolsky40018472011-02-26 01:02:56 +00003092PyObject *
3093PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003094 const char *encoding,
3095 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003096{
3097 PyObject *v;
3098
3099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
3101 goto onError;
3102 }
3103
3104 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106
3107 /* Decode via the codec registry */
3108 v = PyCodec_Decode(unicode, encoding, errors);
3109 if (v == NULL)
3110 goto onError;
3111 if (!PyUnicode_Check(v)) {
3112 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003113 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003114 Py_TYPE(v)->tp_name);
3115 Py_DECREF(v);
3116 goto onError;
3117 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003118 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003119
Benjamin Peterson29060642009-01-31 22:14:21 +00003120 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003121 return NULL;
3122}
3123
Alexander Belopolsky40018472011-02-26 01:02:56 +00003124PyObject *
3125PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003126 Py_ssize_t size,
3127 const char *encoding,
3128 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129{
3130 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003131
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 unicode = PyUnicode_FromUnicode(s, size);
3133 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3136 Py_DECREF(unicode);
3137 return v;
3138}
3139
Alexander Belopolsky40018472011-02-26 01:02:56 +00003140PyObject *
3141PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003142 const char *encoding,
3143 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003144{
3145 PyObject *v;
3146
3147 if (!PyUnicode_Check(unicode)) {
3148 PyErr_BadArgument();
3149 goto onError;
3150 }
3151
3152 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154
3155 /* Encode via the codec registry */
3156 v = PyCodec_Encode(unicode, encoding, errors);
3157 if (v == NULL)
3158 goto onError;
3159 return v;
3160
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003162 return NULL;
3163}
3164
/* Locate the first character of wstr that wcstombs() cannot convert by
   converting the string one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Return the index of that character
   in wstr, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Scratch string: a single character (or surrogate pair) followed by
       a terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t single[3] = {0, 0, 0};
#else
    wchar_t single[2] = {0, 0};
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *current = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1])) {
            single[0] = cursor[0];
            single[1] = cursor[1];
            cursor += 2;
        }
        else {
            single[0] = *cursor;
            single[1] = 0;
            cursor++;
        }
#else
        single[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(encoded, single, sizeof(encoded)) == (size_t)-1) {
            return current - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3211
Victor Stinner1b579672011-12-17 05:47:23 +01003212static int
3213locale_error_handler(const char *errors, int *surrogateescape)
3214{
3215 if (errors == NULL) {
3216 *surrogateescape = 0;
3217 return 0;
3218 }
3219
3220 if (strcmp(errors, "strict") == 0) {
3221 *surrogateescape = 0;
3222 return 0;
3223 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003224 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003225 *surrogateescape = 1;
3226 return 0;
3227 }
3228 PyErr_Format(PyExc_ValueError,
3229 "only 'strict' and 'surrogateescape' error handlers "
3230 "are supported, not '%s'",
3231 errors);
3232 return -1;
3233}
3234
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003235PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003236PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003237{
3238 Py_ssize_t wlen, wlen2;
3239 wchar_t *wstr;
3240 PyObject *bytes = NULL;
3241 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003242 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003243 PyObject *exc;
3244 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003245 int surrogateescape;
3246
3247 if (locale_error_handler(errors, &surrogateescape) < 0)
3248 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003249
3250 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3251 if (wstr == NULL)
3252 return NULL;
3253
3254 wlen2 = wcslen(wstr);
3255 if (wlen2 != wlen) {
3256 PyMem_Free(wstr);
3257 PyErr_SetString(PyExc_TypeError, "embedded null character");
3258 return NULL;
3259 }
3260
3261 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003262 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003263 char *str;
3264
3265 str = _Py_wchar2char(wstr, &error_pos);
3266 if (str == NULL) {
3267 if (error_pos == (size_t)-1) {
3268 PyErr_NoMemory();
3269 PyMem_Free(wstr);
3270 return NULL;
3271 }
3272 else {
3273 goto encode_error;
3274 }
3275 }
3276 PyMem_Free(wstr);
3277
3278 bytes = PyBytes_FromString(str);
3279 PyMem_Free(str);
3280 }
3281 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003282 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003283 size_t len, len2;
3284
3285 len = wcstombs(NULL, wstr, 0);
3286 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003287 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003288 goto encode_error;
3289 }
3290
3291 bytes = PyBytes_FromStringAndSize(NULL, len);
3292 if (bytes == NULL) {
3293 PyMem_Free(wstr);
3294 return NULL;
3295 }
3296
3297 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3298 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003299 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003300 goto encode_error;
3301 }
3302 PyMem_Free(wstr);
3303 }
3304 return bytes;
3305
3306encode_error:
3307 errmsg = strerror(errno);
3308 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003309
3310 if (error_pos == (size_t)-1)
3311 error_pos = wcstombs_errorpos(wstr);
3312
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003313 PyMem_Free(wstr);
3314 Py_XDECREF(bytes);
3315
Victor Stinner2f197072011-12-17 07:08:30 +01003316 if (errmsg != NULL) {
3317 size_t errlen;
3318 wstr = _Py_char2wchar(errmsg, &errlen);
3319 if (wstr != NULL) {
3320 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003321 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003322 } else
3323 errmsg = NULL;
3324 }
3325 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003326 reason = PyUnicode_FromString(
3327 "wcstombs() encountered an unencodable "
3328 "wide character");
3329 if (reason == NULL)
3330 return NULL;
3331
3332 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3333 "locale", unicode,
3334 (Py_ssize_t)error_pos,
3335 (Py_ssize_t)(error_pos+1),
3336 reason);
3337 Py_DECREF(reason);
3338 if (exc != NULL) {
3339 PyCodec_StrictErrors(exc);
3340 Py_XDECREF(exc);
3341 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003342 return NULL;
3343}
3344
Victor Stinnerad158722010-10-27 00:25:46 +00003345PyObject *
3346PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003347{
Victor Stinner99b95382011-07-04 14:23:54 +02003348#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003349 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003350#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003352#else
Victor Stinner793b5312011-04-27 00:24:21 +02003353 PyInterpreterState *interp = PyThreadState_GET()->interp;
3354 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3355 cannot use it to encode and decode filenames before it is loaded. Load
3356 the Python codec requires to encode at least its own filename. Use the C
3357 version of the locale codec until the codec registry is initialized and
3358 the Python codec is loaded.
3359
3360 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3361 cannot only rely on it: check also interp->fscodec_initialized for
3362 subinterpreters. */
3363 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003364 return PyUnicode_AsEncodedString(unicode,
3365 Py_FileSystemDefaultEncoding,
3366 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003367 }
3368 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003369 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003370 }
Victor Stinnerad158722010-10-27 00:25:46 +00003371#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003372}
3373
Alexander Belopolsky40018472011-02-26 01:02:56 +00003374PyObject *
3375PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003376 const char *encoding,
3377 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378{
3379 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003380 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003381
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 if (!PyUnicode_Check(unicode)) {
3383 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 }
Fred Drakee4315f52000-05-09 19:53:39 +00003386
Fred Drakee4315f52000-05-09 19:53:39 +00003387 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003388 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003389 if ((strcmp(lower, "utf-8") == 0) ||
3390 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003391 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003392 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003394 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003395 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003396 }
Victor Stinner37296e82010-06-10 13:36:23 +00003397 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003398 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003399 (strcmp(lower, "iso-8859-1") == 0) ||
3400 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003401 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003402#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003403 else if (strcmp(lower, "mbcs") == 0)
3404 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003405#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003406 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003407 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409
3410 /* Encode via the codec registry */
3411 v = PyCodec_Encode(unicode, encoding, errors);
3412 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003413 return NULL;
3414
3415 /* The normal path */
3416 if (PyBytes_Check(v))
3417 return v;
3418
3419 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003420 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003421 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003422 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003423
3424 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3425 "encoder %s returned bytearray instead of bytes",
3426 encoding);
3427 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003428 Py_DECREF(v);
3429 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003430 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003431
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003432 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3433 Py_DECREF(v);
3434 return b;
3435 }
3436
3437 PyErr_Format(PyExc_TypeError,
3438 "encoder did not return a bytes object (type=%.400s)",
3439 Py_TYPE(v)->tp_name);
3440 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003441 return NULL;
3442}
3443
Alexander Belopolsky40018472011-02-26 01:02:56 +00003444PyObject *
3445PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003446 const char *encoding,
3447 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003448{
3449 PyObject *v;
3450
3451 if (!PyUnicode_Check(unicode)) {
3452 PyErr_BadArgument();
3453 goto onError;
3454 }
3455
3456 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003457 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003458
3459 /* Encode via the codec registry */
3460 v = PyCodec_Encode(unicode, encoding, errors);
3461 if (v == NULL)
3462 goto onError;
3463 if (!PyUnicode_Check(v)) {
3464 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003465 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003466 Py_TYPE(v)->tp_name);
3467 Py_DECREF(v);
3468 goto onError;
3469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003471
Benjamin Peterson29060642009-01-31 22:14:21 +00003472 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473 return NULL;
3474}
3475
/* Locate the first byte sequence of str (len bytes) that cannot be
   decoded, by walking the string with mbrtowc().  Return the byte offset
   of the invalid or incomplete sequence, or 0 if none is found.  Always
   return 0 when mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from a zeroed conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No way to scan the string: report position 0. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3506
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003507PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003508PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003509 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003510{
3511 wchar_t smallbuf[256];
3512 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3513 wchar_t *wstr;
3514 size_t wlen, wlen2;
3515 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003516 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003517 size_t error_pos;
3518 char *errmsg;
3519 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003520
3521 if (locale_error_handler(errors, &surrogateescape) < 0)
3522 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003523
3524 if (str[len] != '\0' || len != strlen(str)) {
3525 PyErr_SetString(PyExc_TypeError, "embedded null character");
3526 return NULL;
3527 }
3528
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003529 if (surrogateescape) {
3530 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531 wstr = _Py_char2wchar(str, &wlen);
3532 if (wstr == NULL) {
3533 if (wlen == (size_t)-1)
3534 PyErr_NoMemory();
3535 else
3536 PyErr_SetFromErrno(PyExc_OSError);
3537 return NULL;
3538 }
3539
3540 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003541 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003542 }
3543 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003544 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003545#ifndef HAVE_BROKEN_MBSTOWCS
3546 wlen = mbstowcs(NULL, str, 0);
3547#else
3548 wlen = len;
3549#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003550 if (wlen == (size_t)-1)
3551 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003552 if (wlen+1 <= smallbuf_len) {
3553 wstr = smallbuf;
3554 }
3555 else {
3556 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3557 return PyErr_NoMemory();
3558
3559 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3560 if (!wstr)
3561 return PyErr_NoMemory();
3562 }
3563
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003564 wlen2 = mbstowcs(wstr, str, wlen+1);
3565 if (wlen2 == (size_t)-1) {
3566 if (wstr != smallbuf)
3567 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003568 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003569 }
3570#ifdef HAVE_BROKEN_MBSTOWCS
3571 assert(wlen2 == wlen);
3572#endif
3573 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3574 if (wstr != smallbuf)
3575 PyMem_Free(wstr);
3576 }
3577 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003578
3579decode_error:
3580 errmsg = strerror(errno);
3581 assert(errmsg != NULL);
3582
3583 error_pos = mbstowcs_errorpos(str, len);
3584 if (errmsg != NULL) {
3585 size_t errlen;
3586 wstr = _Py_char2wchar(errmsg, &errlen);
3587 if (wstr != NULL) {
3588 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003589 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003590 } else
3591 errmsg = NULL;
3592 }
3593 if (errmsg == NULL)
3594 reason = PyUnicode_FromString(
3595 "mbstowcs() encountered an invalid multibyte sequence");
3596 if (reason == NULL)
3597 return NULL;
3598
3599 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3600 "locale", str, len,
3601 (Py_ssize_t)error_pos,
3602 (Py_ssize_t)(error_pos+1),
3603 reason);
3604 Py_DECREF(reason);
3605 if (exc != NULL) {
3606 PyCodec_StrictErrors(exc);
3607 Py_XDECREF(exc);
3608 }
3609 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003610}
3611
3612PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003613PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003614{
3615 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003616 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003617}
3618
3619
3620PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003621PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003622 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003623 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3624}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003625
Christian Heimes5894ba72007-11-04 11:43:14 +00003626PyObject*
3627PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3628{
Victor Stinner99b95382011-07-04 14:23:54 +02003629#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003630 return PyUnicode_DecodeMBCS(s, size, NULL);
3631#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003632 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003633#else
Victor Stinner793b5312011-04-27 00:24:21 +02003634 PyInterpreterState *interp = PyThreadState_GET()->interp;
3635 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3636 cannot use it to encode and decode filenames before it is loaded. Load
3637 the Python codec requires to encode at least its own filename. Use the C
3638 version of the locale codec until the codec registry is initialized and
3639 the Python codec is loaded.
3640
3641 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3642 cannot only rely on it: check also interp->fscodec_initialized for
3643 subinterpreters. */
3644 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003645 return PyUnicode_Decode(s, size,
3646 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003647 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003648 }
3649 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003650 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003651 }
Victor Stinnerad158722010-10-27 00:25:46 +00003652#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003653}
3654
Martin v. Löwis011e8422009-05-05 04:43:17 +00003655
3656int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003657_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003658{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003659 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003660
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003661 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003662 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003663 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3664 PyUnicode_GET_LENGTH(str), '\0', 1);
3665 if (pos == -1)
3666 return 0;
3667 else
3668 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003669}
3670
Antoine Pitrou13348842012-01-29 18:36:34 +01003671int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003672PyUnicode_FSConverter(PyObject* arg, void* addr)
3673{
3674 PyObject *output = NULL;
3675 Py_ssize_t size;
3676 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003677 if (arg == NULL) {
3678 Py_DECREF(*(PyObject**)addr);
3679 return 1;
3680 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003681 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003682 output = arg;
3683 Py_INCREF(output);
3684 }
3685 else {
3686 arg = PyUnicode_FromObject(arg);
3687 if (!arg)
3688 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003689 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003690 Py_DECREF(arg);
3691 if (!output)
3692 return 0;
3693 if (!PyBytes_Check(output)) {
3694 Py_DECREF(output);
3695 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3696 return 0;
3697 }
3698 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003699 size = PyBytes_GET_SIZE(output);
3700 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003701 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003702 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003703 Py_DECREF(output);
3704 return 0;
3705 }
3706 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003707 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003708}
3709
3710
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003711int
3712PyUnicode_FSDecoder(PyObject* arg, void* addr)
3713{
3714 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003715 if (arg == NULL) {
3716 Py_DECREF(*(PyObject**)addr);
3717 return 1;
3718 }
3719 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003720 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003722 output = arg;
3723 Py_INCREF(output);
3724 }
3725 else {
3726 arg = PyBytes_FromObject(arg);
3727 if (!arg)
3728 return 0;
3729 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3730 PyBytes_GET_SIZE(arg));
3731 Py_DECREF(arg);
3732 if (!output)
3733 return 0;
3734 if (!PyUnicode_Check(output)) {
3735 Py_DECREF(output);
3736 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3737 return 0;
3738 }
3739 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003740 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003741 Py_DECREF(output);
3742 return 0;
3743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003745 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003746 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3747 Py_DECREF(output);
3748 return 0;
3749 }
3750 *(PyObject**)addr = output;
3751 return Py_CLEANUP_SUPPORTED;
3752}
3753
3754
Martin v. Löwis5b222132007-06-10 09:51:05 +00003755char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003757{
Christian Heimesf3863112007-11-22 07:46:41 +00003758 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003759
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003760 if (!PyUnicode_Check(unicode)) {
3761 PyErr_BadArgument();
3762 return NULL;
3763 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003764 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003765 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003767 if (PyUnicode_UTF8(unicode) == NULL) {
3768 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3770 if (bytes == NULL)
3771 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3773 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003774 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775 Py_DECREF(bytes);
3776 return NULL;
3777 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003778 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3779 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3780 PyBytes_AS_STRING(bytes),
3781 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 Py_DECREF(bytes);
3783 }
3784
3785 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003786 *psize = PyUnicode_UTF8_LENGTH(unicode);
3787 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003788}
3789
3790char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3794}
3795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796Py_UNICODE *
3797PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799 const unsigned char *one_byte;
3800#if SIZEOF_WCHAR_T == 4
3801 const Py_UCS2 *two_bytes;
3802#else
3803 const Py_UCS4 *four_bytes;
3804 const Py_UCS4 *ucs4_end;
3805 Py_ssize_t num_surrogates;
3806#endif
3807 wchar_t *w;
3808 wchar_t *wchar_end;
3809
3810 if (!PyUnicode_Check(unicode)) {
3811 PyErr_BadArgument();
3812 return NULL;
3813 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003814 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003816 assert(_PyUnicode_KIND(unicode) != 0);
3817 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003819 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003821 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3822 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823 num_surrogates = 0;
3824
3825 for (; four_bytes < ucs4_end; ++four_bytes) {
3826 if (*four_bytes > 0xFFFF)
3827 ++num_surrogates;
3828 }
3829
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003830 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3831 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3832 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833 PyErr_NoMemory();
3834 return NULL;
3835 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003836 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003838 w = _PyUnicode_WSTR(unicode);
3839 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3840 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3842 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003843 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003845 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3846 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 }
3848 else
3849 *w = *four_bytes;
3850
3851 if (w > wchar_end) {
3852 assert(0 && "Miscalculated string end");
3853 }
3854 }
3855 *w = 0;
3856#else
3857 /* sizeof(wchar_t) == 4 */
3858 Py_FatalError("Impossible unicode object state, wstr and str "
3859 "should share memory already.");
3860 return NULL;
3861#endif
3862 }
3863 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003864 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3865 (_PyUnicode_LENGTH(unicode) + 1));
3866 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 PyErr_NoMemory();
3868 return NULL;
3869 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003870 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3871 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3872 w = _PyUnicode_WSTR(unicode);
3873 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003875 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3876 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 for (; w < wchar_end; ++one_byte, ++w)
3878 *w = *one_byte;
3879 /* null-terminate the wstr */
3880 *w = 0;
3881 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003884 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 for (; w < wchar_end; ++two_bytes, ++w)
3886 *w = *two_bytes;
3887 /* null-terminate the wstr */
3888 *w = 0;
3889#else
3890 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 PyObject_FREE(_PyUnicode_WSTR(unicode));
3892 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 Py_FatalError("Impossible unicode object state, wstr "
3894 "and str should share memory already.");
3895 return NULL;
3896#endif
3897 }
3898 else {
3899 assert(0 && "This should never happen.");
3900 }
3901 }
3902 }
3903 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003904 *size = PyUnicode_WSTR_LENGTH(unicode);
3905 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003906}
3907
Alexander Belopolsky40018472011-02-26 01:02:56 +00003908Py_UNICODE *
3909PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912}
3913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914
Alexander Belopolsky40018472011-02-26 01:02:56 +00003915Py_ssize_t
3916PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917{
3918 if (!PyUnicode_Check(unicode)) {
3919 PyErr_BadArgument();
3920 goto onError;
3921 }
3922 return PyUnicode_GET_SIZE(unicode);
3923
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 return -1;
3926}
3927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928Py_ssize_t
3929PyUnicode_GetLength(PyObject *unicode)
3930{
Victor Stinner07621332012-06-16 04:53:46 +02003931 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003932 PyErr_BadArgument();
3933 return -1;
3934 }
Victor Stinner07621332012-06-16 04:53:46 +02003935 if (PyUnicode_READY(unicode) == -1)
3936 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937 return PyUnicode_GET_LENGTH(unicode);
3938}
3939
3940Py_UCS4
3941PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3942{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003943 void *data;
3944 int kind;
3945
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003946 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3947 PyErr_BadArgument();
3948 return (Py_UCS4)-1;
3949 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003950 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003951 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 return (Py_UCS4)-1;
3953 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003954 data = PyUnicode_DATA(unicode);
3955 kind = PyUnicode_KIND(unicode);
3956 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957}
3958
3959int
3960PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3961{
3962 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003963 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 return -1;
3965 }
Victor Stinner488fa492011-12-12 00:01:39 +01003966 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003967 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003968 PyErr_SetString(PyExc_IndexError, "string index out of range");
3969 return -1;
3970 }
Victor Stinner488fa492011-12-12 00:01:39 +01003971 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003972 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003973 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3974 PyErr_SetString(PyExc_ValueError, "character out of range");
3975 return -1;
3976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3978 index, ch);
3979 return 0;
3980}
3981
/* Return the name of the default encoding: always "utf-8".  The string
   has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3987
Victor Stinner554f3f02010-06-16 23:33:54 +00003988/* create or adjust a UnicodeDecodeError */
3989static void
3990make_decode_exception(PyObject **exceptionObject,
3991 const char *encoding,
3992 const char *input, Py_ssize_t length,
3993 Py_ssize_t startpos, Py_ssize_t endpos,
3994 const char *reason)
3995{
3996 if (*exceptionObject == NULL) {
3997 *exceptionObject = PyUnicodeDecodeError_Create(
3998 encoding, input, length, startpos, endpos, reason);
3999 }
4000 else {
4001 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4002 goto onError;
4003 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4004 goto onError;
4005 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4006 goto onError;
4007 }
4008 return;
4009
4010onError:
4011 Py_DECREF(*exceptionObject);
4012 *exceptionObject = NULL;
4013}
4014
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors, errorHandler: name of the error handler and its cached
       callable; the callable is looked up on first use.
   encoding, reason, startinpos, endinpos: used to create or update
       *exceptionObject.
   input, inend: updated from the bytes object held by the exception,
       which the callback might have replaced.
   endinpos, inptr: set to the position returned by the callback.
   output, outpos: wchar_t based output string (resized when needed) and
       the write position in it.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* skip the 4 characters of the "O!n;" format prefix: the rest of
           the string is the error message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the bytes macros below must not be applied to another
           type, and success must not be returned with an exception set. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position is relative to the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4121
4122static int
4123unicode_decode_call_errorhandler_writer(
4124 const char *errors, PyObject **errorHandler,
4125 const char *encoding, const char *reason,
4126 const char **input, const char **inend, Py_ssize_t *startinpos,
4127 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4128 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4129{
4130 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4131
4132 PyObject *restuple = NULL;
4133 PyObject *repunicode = NULL;
4134 Py_ssize_t insize;
4135 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004136 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004137 PyObject *inputobj = NULL;
4138
4139 if (*errorHandler == NULL) {
4140 *errorHandler = PyCodec_LookupError(errors);
4141 if (*errorHandler == NULL)
4142 goto onError;
4143 }
4144
4145 make_decode_exception(exceptionObject,
4146 encoding,
4147 *input, *inend - *input,
4148 *startinpos, *endinpos,
4149 reason);
4150 if (*exceptionObject == NULL)
4151 goto onError;
4152
4153 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4154 if (restuple == NULL)
4155 goto onError;
4156 if (!PyTuple_Check(restuple)) {
4157 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4158 goto onError;
4159 }
4160 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004161 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004162
4163 /* Copy back the bytes variables, which might have been modified by the
4164 callback */
4165 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4166 if (!inputobj)
4167 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004168 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004170 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004171 *input = PyBytes_AS_STRING(inputobj);
4172 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004173 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004174 /* we can DECREF safely, as the exception has another reference,
4175 so the object won't go away. */
4176 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004177
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004180 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4182 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004183 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184
Victor Stinner8f674cc2013-04-17 23:02:17 +02004185 if (PyUnicode_READY(repunicode) < 0)
4186 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004187 replen = PyUnicode_GET_LENGTH(repunicode);
4188 writer->min_length += replen;
4189 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004190 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004191 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004192 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004195 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004196
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198 Py_XDECREF(restuple);
4199 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004203 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204}
4205
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004206/* --- UTF-7 Codec -------------------------------------------------------- */
4207
Antoine Pitrou244651a2009-05-04 18:56:13 +00004208/* See RFC2152 for details. We encode conservatively and decode liberally. */
4209
/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once. */

/* True if c is one of the 64 base-64 characters:
   A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') || \
     ('a' <= (c) && (c) <= 'z') || \
     ('0' <= (c) && (c) <= '9') || \
     (c) == '+' || \
     (c) == '/')

/* Base-64 value (0..63) of c; c must be a base-64 character:
   A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, '/' -> 63. */

#define FROM_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, encountered in a UTF-7 string, is
 * decoded as itself.  Decoding is permissive: the only ASCII byte which
 * does not decode to itself is the + which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
4240
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The result is always false for c == 0 and for c >= 128.  directO and
 * directWS are parenthesized in the expansion, so that an argument such
 * as `a || b` is not torn apart by the && operator applied to it. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286
Alexander Belopolsky40018472011-02-26 01:02:56 +00004287PyObject *
4288PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004289 Py_ssize_t size,
4290 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004291{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004292 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4293}
4294
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295/* The decoder. The only state we preserve is our read position,
4296 * i.e. how many characters we have consumed. So if we end in the
4297 * middle of a shift sequence we have to back off the read position
4298 * and the output to the beginning of the sequence, otherwise we lose
4299 * all the shift state (seen bits, number of bits seen, high
4300 * surrogate). */
4301
Alexander Belopolsky40018472011-02-26 01:02:56 +00004302PyObject *
4303PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004304 Py_ssize_t size,
4305 const char *errors,
4306 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004307{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004309 Py_ssize_t startinpos;
4310 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004311 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004312 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 const char *errmsg = "";
4314 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004315 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004316 unsigned int base64bits = 0;
4317 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004318 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 PyObject *errorHandler = NULL;
4320 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004322 if (size == 0) {
4323 if (consumed)
4324 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004325 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004326 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004329 _PyUnicodeWriter_Init(&writer);
4330 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331
4332 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333 e = s + size;
4334
4335 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004336 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004338 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004339
Antoine Pitrou244651a2009-05-04 18:56:13 +00004340 if (inShift) { /* in a base-64 section */
4341 if (IS_BASE64(ch)) { /* consume a base-64 character */
4342 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4343 base64bits += 6;
4344 s++;
4345 if (base64bits >= 16) {
4346 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004347 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 base64bits -= 16;
4349 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004350 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 if (surrogate) {
4352 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004353 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4354 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004355 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004356 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004358 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
4360 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004361 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004362 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 }
4365 }
Victor Stinner551ac952011-11-29 22:58:13 +01004366 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 /* first surrogate */
4368 surrogate = outCh;
4369 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004371 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004372 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004373 }
4374 }
4375 }
4376 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 inShift = 0;
4378 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004380 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004381 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004382 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 if (base64bits > 0) { /* left-over bits */
4385 if (base64bits >= 6) {
4386 /* We've seen at least one base-64 character */
4387 errmsg = "partial character in shift sequence";
4388 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 else {
4391 /* Some bits remain; they should be zero */
4392 if (base64buffer != 0) {
4393 errmsg = "non-zero padding bits in shift sequence";
4394 goto utf7Error;
4395 }
4396 }
4397 }
4398 if (ch != '-') {
4399 /* '-' is absorbed; other terminating
4400 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004401 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004402 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 }
4405 }
4406 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 s++; /* consume '+' */
4409 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004411 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004412 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 }
4414 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004418 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 }
4420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004423 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 else {
4427 startinpos = s-starts;
4428 s++;
4429 errmsg = "unexpected special character";
4430 goto utf7Error;
4431 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004435 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 errors, &errorHandler,
4437 "utf7", errmsg,
4438 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004439 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004441 }
4442
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443 /* end of string */
4444
4445 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4446 /* if we're in an inconsistent state, that's an error */
4447 if (surrogate ||
4448 (base64bits >= 6) ||
4449 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004451 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 errors, &errorHandler,
4453 "utf7", "unterminated shift sequence",
4454 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004455 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 goto onError;
4457 if (s < e)
4458 goto restart;
4459 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461
4462 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004463 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004465 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004466 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 }
4468 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004469 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004471 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 Py_XDECREF(errorHandler);
4474 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004475 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 Py_XDECREF(errorHandler);
4479 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004480 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481 return NULL;
4482}
4483
4484
Alexander Belopolsky40018472011-02-26 01:02:56 +00004485PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004486_PyUnicode_EncodeUTF7(PyObject *str,
4487 int base64SetO,
4488 int base64WhiteSpace,
4489 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004491 int kind;
4492 void *data;
4493 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004494 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004495 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004496 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 unsigned int base64bits = 0;
4498 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499 char * out;
4500 char * start;
4501
Benjamin Petersonbac79492012-01-14 13:34:47 -05004502 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004503 return NULL;
4504 kind = PyUnicode_KIND(str);
4505 data = PyUnicode_DATA(str);
4506 len = PyUnicode_GET_LENGTH(str);
4507
4508 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004511 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004512 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004513 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004514 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515 if (v == NULL)
4516 return NULL;
4517
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004518 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004519 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004520 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521
Antoine Pitrou244651a2009-05-04 18:56:13 +00004522 if (inShift) {
4523 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4524 /* shifting out */
4525 if (base64bits) { /* output remaining bits */
4526 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4527 base64buffer = 0;
4528 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004529 }
4530 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 /* Characters not in the BASE64 set implicitly unshift the sequence
4532 so no '-' is required, except if the character is itself a '-' */
4533 if (IS_BASE64(ch) || ch == '-') {
4534 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 *out++ = (char) ch;
4537 }
4538 else {
4539 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004540 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 else { /* not in a shift sequence */
4543 if (ch == '+') {
4544 *out++ = '+';
4545 *out++ = '-';
4546 }
4547 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4548 *out++ = (char) ch;
4549 }
4550 else {
4551 *out++ = '+';
4552 inShift = 1;
4553 goto encode_char;
4554 }
4555 }
4556 continue;
4557encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004559 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004560
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 /* code first surrogate */
4562 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004563 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 while (base64bits >= 6) {
4565 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4566 base64bits -= 6;
4567 }
4568 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004569 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 base64bits += 16;
4572 base64buffer = (base64buffer << 16) | ch;
4573 while (base64bits >= 6) {
4574 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4575 base64bits -= 6;
4576 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 if (base64bits)
4579 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4580 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004582 if (_PyBytes_Resize(&v, out - start) < 0)
4583 return NULL;
4584 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004586PyObject *
4587PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4588 Py_ssize_t size,
4589 int base64SetO,
4590 int base64WhiteSpace,
4591 const char *errors)
4592{
4593 PyObject *result;
4594 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4595 if (tmp == NULL)
4596 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004597 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004598 base64WhiteSpace, errors);
4599 Py_DECREF(tmp);
4600 return result;
4601}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603#undef IS_BASE64
4604#undef FROM_BASE64
4605#undef TO_BASE64
4606#undef DECODE_DIRECT
4607#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609/* --- UTF-8 Codec -------------------------------------------------------- */
4610
Alexander Belopolsky40018472011-02-26 01:02:56 +00004611PyObject *
4612PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004613 Py_ssize_t size,
4614 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615{
Walter Dörwald69652032004-09-07 20:24:22 +00004616 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4617}
4618
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004619#include "stringlib/asciilib.h"
4620#include "stringlib/codecs.h"
4621#include "stringlib/undef.h"
4622
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004623#include "stringlib/ucs1lib.h"
4624#include "stringlib/codecs.h"
4625#include "stringlib/undef.h"
4626
4627#include "stringlib/ucs2lib.h"
4628#include "stringlib/codecs.h"
4629#include "stringlib/undef.h"
4630
4631#include "stringlib/ucs4lib.h"
4632#include "stringlib/codecs.h"
4633#include "stringlib/undef.h"
4634
Antoine Pitrouab868312009-01-10 15:40:25 +00004635/* Mask to quickly check whether a C 'long' contains a
4636 non-ASCII, UTF8-encoded char. */
4637#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004638# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004639#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004640# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004641#else
4642# error C 'long' size should be either 4 or 8!
4643#endif
4644
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004645static Py_ssize_t
4646ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004647{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004649 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004651 /*
4652 * Issue #17237: m68k is a bit different from most architectures in
4653 * that objects do not use "natural alignment" - for example, int and
4654 * long are only aligned at 2-byte boundaries. Therefore the assert()
4655 * won't work; also, tests have shown that skipping the "optimised
4656 * version" will even speed up m68k.
4657 */
4658#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004659#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004660 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4661 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662 /* Fast path, see in STRINGLIB(utf8_decode) for
4663 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004664 /* Help allocation */
4665 const char *_p = p;
4666 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 while (_p < aligned_end) {
4668 unsigned long value = *(const unsigned long *) _p;
4669 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004670 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671 *((unsigned long *)q) = value;
4672 _p += SIZEOF_LONG;
4673 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004674 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004675 p = _p;
4676 while (p < end) {
4677 if ((unsigned char)*p & 0x80)
4678 break;
4679 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004684#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685 while (p < end) {
4686 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4687 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004688 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004689 /* Help allocation */
4690 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004691 while (_p < aligned_end) {
4692 unsigned long value = *(unsigned long *) _p;
4693 if (value & ASCII_CHAR_MASK)
4694 break;
4695 _p += SIZEOF_LONG;
4696 }
4697 p = _p;
4698 if (_p == end)
4699 break;
4700 }
4701 if ((unsigned char)*p & 0x80)
4702 break;
4703 ++p;
4704 }
4705 memcpy(dest, start, p - start);
4706 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707}
Antoine Pitrouab868312009-01-10 15:40:25 +00004708
Victor Stinner785938e2011-12-11 20:09:03 +01004709PyObject *
4710PyUnicode_DecodeUTF8Stateful(const char *s,
4711 Py_ssize_t size,
4712 const char *errors,
4713 Py_ssize_t *consumed)
4714{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004715 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004716 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004717 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718
4719 Py_ssize_t startinpos;
4720 Py_ssize_t endinpos;
4721 const char *errmsg = "";
4722 PyObject *errorHandler = NULL;
4723 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004724
4725 if (size == 0) {
4726 if (consumed)
4727 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004728 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004729 }
4730
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004731 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4732 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004733 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 *consumed = 1;
4735 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004736 }
4737
Victor Stinner8f674cc2013-04-17 23:02:17 +02004738 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004739 writer.min_length = size;
4740 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004741 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004742
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004743 writer.pos = ascii_decode(s, end, writer.data);
4744 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 while (s < end) {
4746 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004747 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004749 if (PyUnicode_IS_ASCII(writer.buffer))
4750 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004752 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004754 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004755 } else {
4756 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004757 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 }
4759
4760 switch (ch) {
4761 case 0:
4762 if (s == end || consumed)
4763 goto End;
4764 errmsg = "unexpected end of data";
4765 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004766 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004767 break;
4768 case 1:
4769 errmsg = "invalid start byte";
4770 startinpos = s - starts;
4771 endinpos = startinpos + 1;
4772 break;
4773 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004774 case 3:
4775 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004776 errmsg = "invalid continuation byte";
4777 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004778 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 break;
4780 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004781 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 goto onError;
4783 continue;
4784 }
4785
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004786 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787 errors, &errorHandler,
4788 "utf-8", errmsg,
4789 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004790 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004791 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004792 }
4793
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004795 if (consumed)
4796 *consumed = s - starts;
4797
4798 Py_XDECREF(errorHandler);
4799 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004800 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004801
4802onError:
4803 Py_XDECREF(errorHandler);
4804 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004805 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004806 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004807}
4808
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004809#ifdef __APPLE__
4810
4811/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004812 used to decode the command line arguments on Mac OS X.
4813
4814 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004815 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004816
4817wchar_t*
4818_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4819{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004820 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004821 wchar_t *unicode;
4822 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004823
4824 /* Note: size will always be longer than the resulting Unicode
4825 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004826 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004827 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004828 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004829 if (!unicode)
4830 return NULL;
4831
4832 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004833 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004834 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004835 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004836 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004837#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004839#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004840 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004841#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004842 if (ch > 0xFF) {
4843#if SIZEOF_WCHAR_T == 4
4844 assert(0);
4845#else
4846 assert(Py_UNICODE_IS_SURROGATE(ch));
4847 /* compute and append the two surrogates: */
4848 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4849 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4850#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004851 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004852 else {
4853 if (!ch && s == e)
4854 break;
4855 /* surrogateescape */
4856 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4857 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004858 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004860 return unicode;
4861}
4862
4863#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004865/* Primary internal function which creates utf8 encoded bytes objects.
4866
4867 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004868 and allocate exactly as much space needed at the end. Else allocate the
4869 maximum possible needed (4 result bytes per Unicode character), and return
4870 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004871*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004872PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004873_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874{
Victor Stinner6099a032011-12-18 14:22:26 +01004875 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876 void *data;
4877 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879 if (!PyUnicode_Check(unicode)) {
4880 PyErr_BadArgument();
4881 return NULL;
4882 }
4883
4884 if (PyUnicode_READY(unicode) == -1)
4885 return NULL;
4886
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004887 if (PyUnicode_UTF8(unicode))
4888 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4889 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890
4891 kind = PyUnicode_KIND(unicode);
4892 data = PyUnicode_DATA(unicode);
4893 size = PyUnicode_GET_LENGTH(unicode);
4894
Benjamin Petersonead6b532011-12-20 17:23:42 -06004895 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004896 default:
4897 assert(0);
4898 case PyUnicode_1BYTE_KIND:
4899 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4900 assert(!PyUnicode_IS_ASCII(unicode));
4901 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4902 case PyUnicode_2BYTE_KIND:
4903 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4904 case PyUnicode_4BYTE_KIND:
4905 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907}
4908
Alexander Belopolsky40018472011-02-26 01:02:56 +00004909PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004910PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4911 Py_ssize_t size,
4912 const char *errors)
4913{
4914 PyObject *v, *unicode;
4915
4916 unicode = PyUnicode_FromUnicode(s, size);
4917 if (unicode == NULL)
4918 return NULL;
4919 v = _PyUnicode_AsUTF8String(unicode, errors);
4920 Py_DECREF(unicode);
4921 return v;
4922}
4923
4924PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004925PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004927 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928}
4929
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930/* --- UTF-32 Codec ------------------------------------------------------- */
4931
4932PyObject *
4933PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 Py_ssize_t size,
4935 const char *errors,
4936 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937{
4938 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4939}
4940
4941PyObject *
4942PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 Py_ssize_t size,
4944 const char *errors,
4945 int *byteorder,
4946 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947{
4948 const char *starts = s;
4949 Py_ssize_t startinpos;
4950 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004951 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004952 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004953 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955 PyObject *errorHandler = NULL;
4956 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004957
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 q = (unsigned char *)s;
4959 e = q + size;
4960
4961 if (byteorder)
4962 bo = *byteorder;
4963
4964 /* Check for BOM marks (U+FEFF) in the input and adjust current
4965 byte order setting accordingly. In native mode, the leading BOM
4966 mark is skipped, in all other modes, it is copied to the output
4967 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004968 if (bo == 0 && size >= 4) {
4969 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4970 if (bom == 0x0000FEFF) {
4971 bo = -1;
4972 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004974 else if (bom == 0xFFFE0000) {
4975 bo = 1;
4976 q += 4;
4977 }
4978 if (byteorder)
4979 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980 }
4981
Victor Stinnere64322e2012-10-30 23:12:47 +01004982 if (q == e) {
4983 if (consumed)
4984 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004985 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986 }
4987
Victor Stinnere64322e2012-10-30 23:12:47 +01004988#ifdef WORDS_BIGENDIAN
4989 le = bo < 0;
4990#else
4991 le = bo <= 0;
4992#endif
4993
Victor Stinner8f674cc2013-04-17 23:02:17 +02004994 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004995 writer.min_length = (e - q + 3) / 4;
4996 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004998
Victor Stinnere64322e2012-10-30 23:12:47 +01004999 while (1) {
5000 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005001 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005002
Victor Stinnere64322e2012-10-30 23:12:47 +01005003 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005004 enum PyUnicode_Kind kind = writer.kind;
5005 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005006 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005007 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 if (le) {
5009 do {
5010 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5011 if (ch > maxch)
5012 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005013 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005014 q += 4;
5015 } while (q <= last);
5016 }
5017 else {
5018 do {
5019 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5020 if (ch > maxch)
5021 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005022 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005023 q += 4;
5024 } while (q <= last);
5025 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005026 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005027 }
5028
5029 if (ch <= maxch) {
5030 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005032 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 startinpos = ((const char *)q) - starts;
5035 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005037 else {
5038 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005039 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005040 goto onError;
5041 q += 4;
5042 continue;
5043 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005045 startinpos = ((const char *)q) - starts;
5046 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005048
5049 /* The remaining input chars are ignored if the callback
5050 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005051 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 errors, &errorHandler,
5053 "utf32", errmsg,
5054 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005055 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 }
5058
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062 Py_XDECREF(errorHandler);
5063 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005064 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005067 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068 Py_XDECREF(errorHandler);
5069 Py_XDECREF(exc);
5070 return NULL;
5071}
5072
5073PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005074_PyUnicode_EncodeUTF32(PyObject *str,
5075 const char *errors,
5076 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005078 int kind;
5079 void *data;
5080 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005081 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005083 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005085#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086 int iorder[] = {0, 1, 2, 3};
5087#else
5088 int iorder[] = {3, 2, 1, 0};
5089#endif
5090
Benjamin Peterson29060642009-01-31 22:14:21 +00005091#define STORECHAR(CH) \
5092 do { \
5093 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5094 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5095 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5096 p[iorder[0]] = (CH) & 0xff; \
5097 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 } while(0)
5099
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005100 if (!PyUnicode_Check(str)) {
5101 PyErr_BadArgument();
5102 return NULL;
5103 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005104 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005105 return NULL;
5106 kind = PyUnicode_KIND(str);
5107 data = PyUnicode_DATA(str);
5108 len = PyUnicode_GET_LENGTH(str);
5109
5110 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005111 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005113 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114 if (v == NULL)
5115 return NULL;
5116
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005117 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005121 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005122
5123 if (byteorder == -1) {
5124 /* force LE */
5125 iorder[0] = 0;
5126 iorder[1] = 1;
5127 iorder[2] = 2;
5128 iorder[3] = 3;
5129 }
5130 else if (byteorder == 1) {
5131 /* force BE */
5132 iorder[0] = 3;
5133 iorder[1] = 2;
5134 iorder[2] = 1;
5135 iorder[3] = 0;
5136 }
5137
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005138 for (i = 0; i < len; i++)
5139 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005140
5141 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005142 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143#undef STORECHAR
5144}
5145
Alexander Belopolsky40018472011-02-26 01:02:56 +00005146PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005147PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5148 Py_ssize_t size,
5149 const char *errors,
5150 int byteorder)
5151{
5152 PyObject *result;
5153 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5154 if (tmp == NULL)
5155 return NULL;
5156 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5157 Py_DECREF(tmp);
5158 return result;
5159}
5160
5161PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005162PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163{
Victor Stinnerb960b342011-11-20 19:12:52 +01005164 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005165}
5166
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167/* --- UTF-16 Codec ------------------------------------------------------- */
5168
Tim Peters772747b2001-08-09 22:21:55 +00005169PyObject *
5170PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 Py_ssize_t size,
5172 const char *errors,
5173 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174{
Walter Dörwald69652032004-09-07 20:24:22 +00005175 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5176}
5177
5178PyObject *
5179PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 Py_ssize_t size,
5181 const char *errors,
5182 int *byteorder,
5183 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005186 Py_ssize_t startinpos;
5187 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005188 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005189 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005190 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005191 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005192 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193 PyObject *errorHandler = NULL;
5194 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195
Tim Peters772747b2001-08-09 22:21:55 +00005196 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005197 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198
5199 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005200 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005202 /* Check for BOM marks (U+FEFF) in the input and adjust current
5203 byte order setting accordingly. In native mode, the leading BOM
5204 mark is skipped, in all other modes, it is copied to the output
5205 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005206 if (bo == 0 && size >= 2) {
5207 const Py_UCS4 bom = (q[1] << 8) | q[0];
5208 if (bom == 0xFEFF) {
5209 q += 2;
5210 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005212 else if (bom == 0xFFFE) {
5213 q += 2;
5214 bo = 1;
5215 }
5216 if (byteorder)
5217 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
Antoine Pitrou63065d72012-05-15 23:48:04 +02005220 if (q == e) {
5221 if (consumed)
5222 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005223 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005224 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005225
Christian Heimes743e0cd2012-10-17 23:52:17 +02005226#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005227 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005228#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005229 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005230#endif
Tim Peters772747b2001-08-09 22:21:55 +00005231
Antoine Pitrou63065d72012-05-15 23:48:04 +02005232 /* Note: size will always be longer than the resulting Unicode
5233 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005234 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005235 writer.min_length = (e - q + 1) / 2;
5236 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005237 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005238
Antoine Pitrou63065d72012-05-15 23:48:04 +02005239 while (1) {
5240 Py_UCS4 ch = 0;
5241 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005242 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005243 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005245 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005246 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005247 native_ordering);
5248 else
5249 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005250 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005251 native_ordering);
5252 } else if (kind == PyUnicode_2BYTE_KIND) {
5253 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005254 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005255 native_ordering);
5256 } else {
5257 assert(kind == PyUnicode_4BYTE_KIND);
5258 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005259 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005260 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005261 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005262 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005263
Antoine Pitrou63065d72012-05-15 23:48:04 +02005264 switch (ch)
5265 {
5266 case 0:
5267 /* remaining byte at the end? (size should be even) */
5268 if (q == e || consumed)
5269 goto End;
5270 errmsg = "truncated data";
5271 startinpos = ((const char *)q) - starts;
5272 endinpos = ((const char *)e) - starts;
5273 break;
5274 /* The remaining input chars are ignored if the callback
5275 chooses to skip the input */
5276 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005277 q -= 2;
5278 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005279 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005280 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005281 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005282 endinpos = ((const char *)e) - starts;
5283 break;
5284 case 2:
5285 errmsg = "illegal encoding";
5286 startinpos = ((const char *)q) - 2 - starts;
5287 endinpos = startinpos + 2;
5288 break;
5289 case 3:
5290 errmsg = "illegal UTF-16 surrogate";
5291 startinpos = ((const char *)q) - 4 - starts;
5292 endinpos = startinpos + 2;
5293 break;
5294 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005295 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005296 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 continue;
5298 }
5299
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005301 errors,
5302 &errorHandler,
5303 "utf16", errmsg,
5304 &starts,
5305 (const char **)&e,
5306 &startinpos,
5307 &endinpos,
5308 &exc,
5309 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005310 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 }
5313
Antoine Pitrou63065d72012-05-15 23:48:04 +02005314End:
Walter Dörwald69652032004-09-07 20:24:22 +00005315 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005317
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005318 Py_XDECREF(errorHandler);
5319 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005320 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005323 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 Py_XDECREF(errorHandler);
5325 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 return NULL;
5327}
5328
Tim Peters772747b2001-08-09 22:21:55 +00005329PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005330_PyUnicode_EncodeUTF16(PyObject *str,
5331 const char *errors,
5332 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005334 enum PyUnicode_Kind kind;
5335 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005336 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005337 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005338 unsigned short *out;
5339 Py_ssize_t bytesize;
5340 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005341#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005342 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005343#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005344 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005345#endif
5346
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005347 if (!PyUnicode_Check(str)) {
5348 PyErr_BadArgument();
5349 return NULL;
5350 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005351 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005352 return NULL;
5353 kind = PyUnicode_KIND(str);
5354 data = PyUnicode_DATA(str);
5355 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005356
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005357 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005358 if (kind == PyUnicode_4BYTE_KIND) {
5359 const Py_UCS4 *in = (const Py_UCS4 *)data;
5360 const Py_UCS4 *end = in + len;
5361 while (in < end)
5362 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005363 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005364 }
5365 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005367 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005368 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 if (v == NULL)
5370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005372 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005373 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005374 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005376 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005377 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005378 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005379
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005380 switch (kind) {
5381 case PyUnicode_1BYTE_KIND: {
5382 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5383 break;
Tim Peters772747b2001-08-09 22:21:55 +00005384 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005385 case PyUnicode_2BYTE_KIND: {
5386 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5387 break;
Tim Peters772747b2001-08-09 22:21:55 +00005388 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005389 case PyUnicode_4BYTE_KIND: {
5390 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5391 break;
5392 }
5393 default:
5394 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005395 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005396
5397 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005398 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399}
5400
Alexander Belopolsky40018472011-02-26 01:02:56 +00005401PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005402PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5403 Py_ssize_t size,
5404 const char *errors,
5405 int byteorder)
5406{
5407 PyObject *result;
5408 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5409 if (tmp == NULL)
5410 return NULL;
5411 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5412 Py_DECREF(tmp);
5413 return result;
5414}
5415
5416PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005417PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005419 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420}
5421
5422/* --- Unicode Escape Codec ----------------------------------------------- */
5423
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   Returns the number of characters the decoded string will contain, or
   -1 when that cannot be determined by this quick scan:
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).
   An unrecognised escape such as "\q" is counted as two characters
   (the backslash and the character following it). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it:
       forming `p + size` for a negative size is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5478
/* Cached pointer to the unicodedata name-lookup C API.  Starts out NULL;
   PyUnicode_DecodeUnicodeEscape fills it in via PyCapsule_Import() when
   it handles a \N escape and the pointer is still NULL. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005480
Alexander Belopolsky40018472011-02-26 01:02:56 +00005481PyObject *
5482PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005483 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005484 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005487 Py_ssize_t startinpos;
5488 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005489 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005491 char* message;
5492 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 PyObject *errorHandler = NULL;
5494 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005495 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005496
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005497 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005498 if (len == 0)
5499 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005500
5501 /* After length_of_escaped_ascii_string() there are two alternatives,
5502 either the string is pure ASCII with named escapes like \n, etc.
5503 and we determined it's exact size (common case)
5504 or it contains \x, \u, ... escape sequences. then we create a
5505 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005506 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005507 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005508 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509 }
5510 else {
5511 /* Escaped strings will always be longer than the resulting
5512 Unicode string, so we start with size here and then reduce the
5513 length after conversion to the true value.
5514 (but if the error callback returns a long replacement string
5515 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005516 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517 }
5518
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005522
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 while (s < end) {
5524 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005525 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
5528 /* Non-escape characters are interpreted as Unicode ordinals */
5529 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005530 x = (unsigned char)*s;
5531 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005532 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005533 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 continue;
5535 }
5536
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 /* \ - Escapes */
5539 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005540 c = *s++;
5541 if (s > end)
5542 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005544 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005547#define WRITECHAR(ch) \
5548 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005549 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005550 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005551 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005552
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005554 case '\\': WRITECHAR('\\'); break;
5555 case '\'': WRITECHAR('\''); break;
5556 case '\"': WRITECHAR('\"'); break;
5557 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559 case 'f': WRITECHAR('\014'); break;
5560 case 't': WRITECHAR('\t'); break;
5561 case 'n': WRITECHAR('\n'); break;
5562 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005563 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005564 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005565 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005566 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 case '0': case '1': case '2': case '3':
5570 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005571 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005572 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005573 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005574 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005575 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005577 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 break;
5579
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 /* hex escapes */
5581 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005583 digits = 2;
5584 message = "truncated \\xXX escape";
5585 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005589 digits = 4;
5590 message = "truncated \\uXXXX escape";
5591 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005594 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005595 digits = 8;
5596 message = "truncated \\UXXXXXXXX escape";
5597 hexescape:
5598 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005599 if (end - s < digits) {
5600 /* count only hex digits */
5601 for (; s < end; ++s) {
5602 c = (unsigned char)*s;
5603 if (!Py_ISXDIGIT(c))
5604 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005605 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005606 goto error;
5607 }
5608 for (; digits--; ++s) {
5609 c = (unsigned char)*s;
5610 if (!Py_ISXDIGIT(c))
5611 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005612 chr = (chr<<4) & ~0xF;
5613 if (c >= '0' && c <= '9')
5614 chr += c - '0';
5615 else if (c >= 'a' && c <= 'f')
5616 chr += 10 + c - 'a';
5617 else
5618 chr += 10 + c - 'A';
5619 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005620 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005621 /* _decoding_error will have already written into the
5622 target buffer. */
5623 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005624 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005625 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005626 message = "illegal Unicode character";
5627 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005628 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005629 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005630 break;
5631
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005633 case 'N':
5634 message = "malformed \\N character escape";
5635 if (ucnhash_CAPI == NULL) {
5636 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005637 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5638 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005639 if (ucnhash_CAPI == NULL)
5640 goto ucnhashError;
5641 }
5642 if (*s == '{') {
5643 const char *start = s+1;
5644 /* look for the closing brace */
5645 while (*s != '}' && s < end)
5646 s++;
5647 if (s > start && s < end && *s == '}') {
5648 /* found a name. look it up in the unicode database */
5649 message = "unknown Unicode character name";
5650 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005651 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005652 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005653 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005654 goto store;
5655 }
5656 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005657 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005658
5659 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005660 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 message = "\\ at end of string";
5662 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005663 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005664 }
5665 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005666 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005667 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005668 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005669 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005671 continue;
5672
5673 error:
5674 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005675 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005676 errors, &errorHandler,
5677 "unicodeescape", message,
5678 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005679 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005680 goto onError;
5681 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005683#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005685 Py_XDECREF(errorHandler);
5686 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005688
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005690 PyErr_SetString(
5691 PyExc_UnicodeError,
5692 "\\N escapes not supported (can't load unicodedata module)"
5693 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 Py_XDECREF(errorHandler);
5696 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005697 return NULL;
5698
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005700 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 Py_XDECREF(errorHandler);
5702 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 return NULL;
5704}
5705
5706/* Return a Unicode-Escape string version of the Unicode object.
5707
5708 If quotes is true, the string is enclosed in u"" or u'' quotes as
5709 appropriate.
5710
5711*/
5712
Alexander Belopolsky40018472011-02-26 01:02:56 +00005713PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005714PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005716 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005717 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719 int kind;
5720 void *data;
5721 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722
Ezio Melottie7f90372012-10-05 03:33:31 +03005723 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005724 escape.
5725
Ezio Melottie7f90372012-10-05 03:33:31 +03005726 For UCS1 strings it's '\xxx', 4 bytes per source character.
5727 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5728 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005729 */
5730
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005731 if (!PyUnicode_Check(unicode)) {
5732 PyErr_BadArgument();
5733 return NULL;
5734 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005735 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 return NULL;
5737 len = PyUnicode_GET_LENGTH(unicode);
5738 kind = PyUnicode_KIND(unicode);
5739 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005740 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005741 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5742 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5743 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5744 }
5745
5746 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005747 return PyBytes_FromStringAndSize(NULL, 0);
5748
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005749 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005751
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005752 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 if (repr == NULL)
5757 return NULL;
5758
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005759 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005761 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005762 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005763
Walter Dörwald79e913e2007-05-12 11:08:06 +00005764 /* Escape backslashes */
5765 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 *p++ = '\\';
5767 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005768 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005769 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005770
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005771 /* Map 21-bit characters to '\U00xxxxxx' */
5772 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005773 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005774 *p++ = '\\';
5775 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005776 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5777 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5778 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5779 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5780 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5781 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5782 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5783 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005785 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005786
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005788 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 *p++ = '\\';
5790 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005791 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5792 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5793 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5794 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005796
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005797 /* Map special whitespace to '\t', \n', '\r' */
5798 else if (ch == '\t') {
5799 *p++ = '\\';
5800 *p++ = 't';
5801 }
5802 else if (ch == '\n') {
5803 *p++ = '\\';
5804 *p++ = 'n';
5805 }
5806 else if (ch == '\r') {
5807 *p++ = '\\';
5808 *p++ = 'r';
5809 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005810
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005811 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005812 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005814 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005815 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5816 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005817 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005818
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 /* Copy everything else as-is */
5820 else
5821 *p++ = (char) ch;
5822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005824 assert(p - PyBytes_AS_STRING(repr) > 0);
5825 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5826 return NULL;
5827 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828}
5829
Alexander Belopolsky40018472011-02-26 01:02:56 +00005830PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005831PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5832 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005834 PyObject *result;
5835 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5836 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005838 result = PyUnicode_AsUnicodeEscapeString(tmp);
5839 Py_DECREF(tmp);
5840 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841}
5842
5843/* --- Raw Unicode Escape Codec ------------------------------------------- */
5844
Alexander Belopolsky40018472011-02-26 01:02:56 +00005845PyObject *
5846PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005847 Py_ssize_t size,
5848 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005851 Py_ssize_t startinpos;
5852 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 const char *end;
5855 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 PyObject *errorHandler = NULL;
5857 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005858
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005859 if (size == 0)
5860 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 /* Escaped strings will always be longer than the resulting
5863 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 length after conversion to the true value. (But decoding error
5865 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005866 _PyUnicodeWriter_Init(&writer);
5867 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005868
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 end = s + size;
5870 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 unsigned char c;
5872 Py_UCS4 x;
5873 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005874 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 /* Non-escape characters are interpreted as Unicode ordinals */
5877 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005878 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005879 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005880 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 startinpos = s-starts;
5884
5885 /* \u-escapes are only interpreted iff the number of leading
5886 backslashes if odd */
5887 bs = s;
5888 for (;s < end;) {
5889 if (*s != '\\')
5890 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005891 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005892 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005893 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 }
5895 if (((s - bs) & 1) == 0 ||
5896 s >= end ||
5897 (*s != 'u' && *s != 'U')) {
5898 continue;
5899 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 count = *s=='u' ? 4 : 8;
5902 s++;
5903
5904 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 for (x = 0, i = 0; i < count; ++i, ++s) {
5906 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005907 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005909 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 errors, &errorHandler,
5911 "rawunicodeescape", "truncated \\uXXXX",
5912 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005913 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 goto onError;
5915 goto nextByte;
5916 }
5917 x = (x<<4) & ~0xF;
5918 if (c >= '0' && c <= '9')
5919 x += c - '0';
5920 else if (c >= 'a' && c <= 'f')
5921 x += 10 + c - 'a';
5922 else
5923 x += 10 + c - 'A';
5924 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005925 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005926 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005927 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005928 }
5929 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005930 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005931 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005932 errors, &errorHandler,
5933 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005935 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005937 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 nextByte:
5939 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941 Py_XDECREF(errorHandler);
5942 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005943 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005944
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005946 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 Py_XDECREF(errorHandler);
5948 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 return NULL;
5950}
5951
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952
Alexander Belopolsky40018472011-02-26 01:02:56 +00005953PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005956 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 char *p;
5958 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005959 Py_ssize_t expandsize, pos;
5960 int kind;
5961 void *data;
5962 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005964 if (!PyUnicode_Check(unicode)) {
5965 PyErr_BadArgument();
5966 return NULL;
5967 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005968 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005969 return NULL;
5970 kind = PyUnicode_KIND(unicode);
5971 data = PyUnicode_DATA(unicode);
5972 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005973 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5974 bytes, and 1 byte characters 4. */
5975 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005976
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005977 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005979
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005980 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 if (repr == NULL)
5982 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005983 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005984 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005986 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005987 for (pos = 0; pos < len; pos++) {
5988 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 /* Map 32-bit characters to '\Uxxxxxxxx' */
5990 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005991 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005992 *p++ = '\\';
5993 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005994 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5995 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5996 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5997 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5998 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5999 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6000 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6001 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006002 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006004 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 *p++ = '\\';
6006 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006007 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6008 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6009 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6010 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 /* Copy everything else as-is */
6013 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 *p++ = (char) ch;
6015 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006016
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006017 assert(p > q);
6018 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006019 return NULL;
6020 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021}
6022
Alexander Belopolsky40018472011-02-26 01:02:56 +00006023PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6025 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027 PyObject *result;
6028 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6029 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006030 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6032 Py_DECREF(tmp);
6033 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034}
6035
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006036/* --- Unicode Internal Codec ------------------------------------------- */
6037
Alexander Belopolsky40018472011-02-26 01:02:56 +00006038PyObject *
6039_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006040 Py_ssize_t size,
6041 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006042{
6043 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006044 Py_ssize_t startinpos;
6045 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006046 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006047 const char *end;
6048 const char *reason;
6049 PyObject *errorHandler = NULL;
6050 PyObject *exc = NULL;
6051
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006052 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006053 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006054 1))
6055 return NULL;
6056
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006057 if (size == 0)
6058 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006059
Victor Stinner8f674cc2013-04-17 23:02:17 +02006060 _PyUnicodeWriter_Init(&writer);
6061 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6062 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006064 }
6065 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006066
Victor Stinner8f674cc2013-04-17 23:02:17 +02006067 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006068 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006069 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006070 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006071 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006072 endinpos = end-starts;
6073 reason = "truncated input";
6074 goto error;
6075 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006076 /* We copy the raw representation one byte at a time because the
6077 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006078 ((char *) &uch)[0] = s[0];
6079 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006080#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006081 ((char *) &uch)[2] = s[2];
6082 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006083#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006084 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006085#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006086 /* We have to sanity check the raw data, otherwise doom looms for
6087 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006088 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006089 endinpos = s - starts + Py_UNICODE_SIZE;
6090 reason = "illegal code point (> 0x10FFFF)";
6091 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006092 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006093#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006094 s += Py_UNICODE_SIZE;
6095#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006096 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006097 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006098 Py_UNICODE uch2;
6099 ((char *) &uch2)[0] = s[0];
6100 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006101 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006102 {
Victor Stinner551ac952011-11-29 22:58:13 +01006103 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006104 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006105 }
6106 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006107#endif
6108
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006109 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006110 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006111 continue;
6112
6113 error:
6114 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006115 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006116 errors, &errorHandler,
6117 "unicode_internal", reason,
6118 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006119 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006120 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121 }
6122
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006123 Py_XDECREF(errorHandler);
6124 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006125 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006126
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006128 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006129 Py_XDECREF(errorHandler);
6130 Py_XDECREF(exc);
6131 return NULL;
6132}
6133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134/* --- Latin-1 Codec ------------------------------------------------------ */
6135
Alexander Belopolsky40018472011-02-26 01:02:56 +00006136PyObject *
6137PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006138 Py_ssize_t size,
6139 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006142 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143}
6144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006146static void
6147make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006148 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006149 PyObject *unicode,
6150 Py_ssize_t startpos, Py_ssize_t endpos,
6151 const char *reason)
6152{
6153 if (*exceptionObject == NULL) {
6154 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006155 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006156 encoding, unicode, startpos, endpos, reason);
6157 }
6158 else {
6159 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6160 goto onError;
6161 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6162 goto onError;
6163 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6164 goto onError;
6165 return;
6166 onError:
6167 Py_DECREF(*exceptionObject);
6168 *exceptionObject = NULL;
6169 }
6170}
6171
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006172/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006173static void
6174raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006175 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006176 PyObject *unicode,
6177 Py_ssize_t startpos, Py_ssize_t endpos,
6178 const char *reason)
6179{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006180 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006181 encoding, unicode, startpos, endpos, reason);
6182 if (*exceptionObject != NULL)
6183 PyCodec_StrictErrors(*exceptionObject);
6184}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185
6186/* error handling callback helper:
6187 build arguments, call the callback and check the arguments,
6188 put the result into newpos and return the replacement string, which
6189 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006190static PyObject *
6191unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006192 PyObject **errorHandler,
6193 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006194 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006195 Py_ssize_t startpos, Py_ssize_t endpos,
6196 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006198 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006199 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006200 PyObject *restuple;
6201 PyObject *resunicode;
6202
6203 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006205 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 }
6208
Benjamin Petersonbac79492012-01-14 13:34:47 -05006209 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006210 return NULL;
6211 len = PyUnicode_GET_LENGTH(unicode);
6212
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006213 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006214 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006215 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217
6218 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006220 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006223 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 Py_DECREF(restuple);
6225 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006226 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006227 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 &resunicode, newpos)) {
6229 Py_DECREF(restuple);
6230 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006231 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006232 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6233 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6234 Py_DECREF(restuple);
6235 return NULL;
6236 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006237 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006238 *newpos = len + *newpos;
6239 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6241 Py_DECREF(restuple);
6242 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006243 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006244 Py_INCREF(resunicode);
6245 Py_DECREF(restuple);
6246 return resunicode;
6247}
6248
Alexander Belopolsky40018472011-02-26 01:02:56 +00006249static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006250unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006251 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006252 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006253{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006254 /* input state */
6255 Py_ssize_t pos=0, size;
6256 int kind;
6257 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258 /* output object */
6259 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006260 /* pointer into the output */
6261 char *str;
6262 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006263 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006264 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6265 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006266 PyObject *errorHandler = NULL;
6267 PyObject *exc = NULL;
6268 /* the following variable is used for caching string comparisons
6269 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6270 int known_errorHandler = -1;
6271
Benjamin Petersonbac79492012-01-14 13:34:47 -05006272 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006273 return NULL;
6274 size = PyUnicode_GET_LENGTH(unicode);
6275 kind = PyUnicode_KIND(unicode);
6276 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006277 /* allocate enough for a simple encoding without
6278 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006279 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006280 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006281 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006282 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006283 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006284 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 ressize = size;
6286
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006287 while (pos < size) {
6288 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 /* can we encode this? */
6291 if (c<limit) {
6292 /* no overflow check, because we know that the space is enough */
6293 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006294 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006295 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 Py_ssize_t requiredsize;
6298 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006299 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006301 Py_ssize_t collstart = pos;
6302 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006304 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 ++collend;
6306 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6307 if (known_errorHandler==-1) {
6308 if ((errors==NULL) || (!strcmp(errors, "strict")))
6309 known_errorHandler = 1;
6310 else if (!strcmp(errors, "replace"))
6311 known_errorHandler = 2;
6312 else if (!strcmp(errors, "ignore"))
6313 known_errorHandler = 3;
6314 else if (!strcmp(errors, "xmlcharrefreplace"))
6315 known_errorHandler = 4;
6316 else
6317 known_errorHandler = 0;
6318 }
6319 switch (known_errorHandler) {
6320 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006321 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 goto onError;
6323 case 2: /* replace */
6324 while (collstart++<collend)
6325 *str++ = '?'; /* fall through */
6326 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006327 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 break;
6329 case 4: /* xmlcharrefreplace */
6330 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006331 /* determine replacement size */
6332 for (i = collstart, repsize = 0; i < collend; ++i) {
6333 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6334 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006336 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006338 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006340 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006342 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006344 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006346 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006347 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 if (requiredsize > ressize) {
6353 if (requiredsize<2*ressize)
6354 requiredsize = 2*ressize;
6355 if (_PyBytes_Resize(&res, requiredsize))
6356 goto onError;
6357 str = PyBytes_AS_STRING(res) + respos;
6358 ressize = requiredsize;
6359 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006360 /* generate replacement */
6361 for (i = collstart; i < collend; ++i) {
6362 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 break;
6366 default:
6367 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 encoding, reason, unicode, &exc,
6369 collstart, collend, &newpos);
6370 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006371 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006373 if (PyBytes_Check(repunicode)) {
6374 /* Directly copy bytes result to output. */
6375 repsize = PyBytes_Size(repunicode);
6376 if (repsize > 1) {
6377 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006378 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006379 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6380 Py_DECREF(repunicode);
6381 goto onError;
6382 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006383 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006384 ressize += repsize-1;
6385 }
6386 memcpy(str, PyBytes_AsString(repunicode), repsize);
6387 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006389 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006390 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006391 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 /* need more space? (at least enough for what we
6393 have+the replacement+the rest of the string, so
6394 we won't have to check space for encodable characters) */
6395 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006396 repsize = PyUnicode_GET_LENGTH(repunicode);
6397 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 if (requiredsize > ressize) {
6399 if (requiredsize<2*ressize)
6400 requiredsize = 2*ressize;
6401 if (_PyBytes_Resize(&res, requiredsize)) {
6402 Py_DECREF(repunicode);
6403 goto onError;
6404 }
6405 str = PyBytes_AS_STRING(res) + respos;
6406 ressize = requiredsize;
6407 }
6408 /* check if there is anything unencodable in the replacement
6409 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006410 for (i = 0; repsize-->0; ++i, ++str) {
6411 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006413 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 Py_DECREF(repunicode);
6416 goto onError;
6417 }
6418 *str = (char)c;
6419 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006421 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006423 }
6424 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006425 /* Resize if we allocated to much */
6426 size = str - PyBytes_AS_STRING(res);
6427 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006428 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006429 if (_PyBytes_Resize(&res, size) < 0)
6430 goto onError;
6431 }
6432
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 Py_XDECREF(errorHandler);
6434 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006435 return res;
6436
6437 onError:
6438 Py_XDECREF(res);
6439 Py_XDECREF(errorHandler);
6440 Py_XDECREF(exc);
6441 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442}
6443
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006445PyObject *
6446PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006447 Py_ssize_t size,
6448 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006450 PyObject *result;
6451 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6452 if (unicode == NULL)
6453 return NULL;
6454 result = unicode_encode_ucs1(unicode, errors, 256);
6455 Py_DECREF(unicode);
6456 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457}
6458
Alexander Belopolsky40018472011-02-26 01:02:56 +00006459PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006460_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461{
6462 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 PyErr_BadArgument();
6464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006466 if (PyUnicode_READY(unicode) == -1)
6467 return NULL;
6468 /* Fast path: if it is a one-byte string, construct
6469 bytes object directly. */
6470 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6471 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6472 PyUnicode_GET_LENGTH(unicode));
6473 /* Non-Latin-1 characters present. Defer to above function to
6474 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006475 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006476}
6477
6478PyObject*
6479PyUnicode_AsLatin1String(PyObject *unicode)
6480{
6481 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482}
6483
6484/* --- 7-bit ASCII Codec -------------------------------------------------- */
6485
Alexander Belopolsky40018472011-02-26 01:02:56 +00006486PyObject *
6487PyUnicode_DecodeASCII(const char *s,
6488 Py_ssize_t size,
6489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006492 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006493 int kind;
6494 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 Py_ssize_t startinpos;
6496 Py_ssize_t endinpos;
6497 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498 const char *e;
6499 PyObject *errorHandler = NULL;
6500 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006501
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006503 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006504
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006506 if (size == 1 && (unsigned char)s[0] < 128)
6507 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006508
Victor Stinner8f674cc2013-04-17 23:02:17 +02006509 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006510 writer.min_length = size;
6511 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006512 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006514 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006515 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006516 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006517 writer.pos = outpos;
6518 if (writer.pos == size)
6519 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006520
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006521 s += writer.pos;
6522 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006524 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006526 PyUnicode_WRITE(kind, data, writer.pos, c);
6527 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 ++s;
6529 }
6530 else {
6531 startinpos = s-starts;
6532 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006533 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 errors, &errorHandler,
6535 "ascii", "ordinal not in range(128)",
6536 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006537 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006539 kind = writer.kind;
6540 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 Py_XDECREF(errorHandler);
6544 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006545 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006546
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006548 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006549 Py_XDECREF(errorHandler);
6550 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 return NULL;
6552}
6553
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006555PyObject *
6556PyUnicode_EncodeASCII(const Py_UNICODE *p,
6557 Py_ssize_t size,
6558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 PyObject *result;
6561 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6562 if (unicode == NULL)
6563 return NULL;
6564 result = unicode_encode_ucs1(unicode, errors, 128);
6565 Py_DECREF(unicode);
6566 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567}
6568
Alexander Belopolsky40018472011-02-26 01:02:56 +00006569PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006570_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571{
6572 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 PyErr_BadArgument();
6574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006576 if (PyUnicode_READY(unicode) == -1)
6577 return NULL;
6578 /* Fast path: if it is an ASCII-only string, construct bytes object
6579 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006580 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006581 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6582 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006583 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006584}
6585
6586PyObject *
6587PyUnicode_AsASCIIString(PyObject *unicode)
6588{
6589 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590}
6591
Victor Stinner99b95382011-07-04 14:23:54 +02006592#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006593
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006594/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006595
/* When size_t is wider than int, an input length may not fit in an int;
   NEED_RETRY enables the code that then feeds the input in chunks of at
   most INT_MAX. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide WC_ERR_INVALID_CHARS when the platform headers do not. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6603
6604static char*
6605code_page_name(UINT code_page, PyObject **obj)
6606{
6607 *obj = NULL;
6608 if (code_page == CP_ACP)
6609 return "mbcs";
6610 if (code_page == CP_UTF7)
6611 return "CP_UTF7";
6612 if (code_page == CP_UTF8)
6613 return "CP_UTF8";
6614
6615 *obj = PyBytes_FromFormat("cp%u", code_page);
6616 if (*obj == NULL)
6617 return NULL;
6618 return PyBytes_AS_STRING(*obj);
6619}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006620
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006622is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006623{
6624 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006625 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626
Victor Stinner3a50e702011-10-18 21:21:00 +02006627 if (!IsDBCSLeadByteEx(code_page, *curr))
6628 return 0;
6629
6630 prev = CharPrevExA(code_page, s, curr, 0);
6631 if (prev == curr)
6632 return 1;
6633 /* FIXME: This code is limited to "true" double-byte encodings,
6634 as it assumes an incomplete character consists of a single
6635 byte. */
6636 if (curr - prev == 2)
6637 return 1;
6638 if (!IsDBCSLeadByteEx(code_page, *prev))
6639 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006640 return 0;
6641}
6642
Victor Stinner3a50e702011-10-18 21:21:00 +02006643static DWORD
6644decode_code_page_flags(UINT code_page)
6645{
6646 if (code_page == CP_UTF7) {
6647 /* The CP_UTF7 decoder only supports flags=0 */
6648 return 0;
6649 }
6650 else
6651 return MB_ERR_INVALID_CHARS;
6652}
6653
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006654/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006655 * Decode a byte string from a Windows code page into unicode object in strict
6656 * mode.
6657 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006658 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6659 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006660 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006661static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006662decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006663 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006664 const char *in,
6665 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006666{
Victor Stinner3a50e702011-10-18 21:21:00 +02006667 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006668 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006669 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006670
6671 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006672 assert(insize > 0);
6673 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6674 if (outsize <= 0)
6675 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006676
6677 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006679 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006680 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 if (*v == NULL)
6682 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006683 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006684 }
6685 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006687 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006688 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006690 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006691 }
6692
6693 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006694 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6695 if (outsize <= 0)
6696 goto error;
6697 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006698
Victor Stinner3a50e702011-10-18 21:21:00 +02006699error:
6700 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6701 return -2;
6702 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006703 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006704}
6705
Victor Stinner3a50e702011-10-18 21:21:00 +02006706/*
6707 * Decode a byte string from a code page into unicode object with an error
6708 * handler.
6709 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006710 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006711 * UnicodeDecodeError exception and returns -1 on error.
6712 */
6713static int
6714decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006715 PyObject **v,
6716 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006717 const char *errors)
6718{
6719 const char *startin = in;
6720 const char *endin = in + size;
6721 const DWORD flags = decode_code_page_flags(code_page);
6722 /* Ideally, we should get reason from FormatMessage. This is the Windows
6723 2000 English version of the message. */
6724 const char *reason = "No mapping for the Unicode character exists "
6725 "in the target code page.";
6726 /* each step cannot decode more than 1 character, but a character can be
6727 represented as a surrogate pair */
6728 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006729 int insize;
6730 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 PyObject *errorHandler = NULL;
6732 PyObject *exc = NULL;
6733 PyObject *encoding_obj = NULL;
6734 char *encoding;
6735 DWORD err;
6736 int ret = -1;
6737
6738 assert(size > 0);
6739
6740 encoding = code_page_name(code_page, &encoding_obj);
6741 if (encoding == NULL)
6742 return -1;
6743
6744 if (errors == NULL || strcmp(errors, "strict") == 0) {
6745 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6746 UnicodeDecodeError. */
6747 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6748 if (exc != NULL) {
6749 PyCodec_StrictErrors(exc);
6750 Py_CLEAR(exc);
6751 }
6752 goto error;
6753 }
6754
6755 if (*v == NULL) {
6756 /* Create unicode object */
6757 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6758 PyErr_NoMemory();
6759 goto error;
6760 }
Victor Stinnerab595942011-12-17 04:59:06 +01006761 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006762 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006763 if (*v == NULL)
6764 goto error;
6765 startout = PyUnicode_AS_UNICODE(*v);
6766 }
6767 else {
6768 /* Extend unicode object */
6769 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6770 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6771 PyErr_NoMemory();
6772 goto error;
6773 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006774 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006775 goto error;
6776 startout = PyUnicode_AS_UNICODE(*v) + n;
6777 }
6778
6779 /* Decode the byte string character per character */
6780 out = startout;
6781 while (in < endin)
6782 {
6783 /* Decode a character */
6784 insize = 1;
6785 do
6786 {
6787 outsize = MultiByteToWideChar(code_page, flags,
6788 in, insize,
6789 buffer, Py_ARRAY_LENGTH(buffer));
6790 if (outsize > 0)
6791 break;
6792 err = GetLastError();
6793 if (err != ERROR_NO_UNICODE_TRANSLATION
6794 && err != ERROR_INSUFFICIENT_BUFFER)
6795 {
6796 PyErr_SetFromWindowsErr(0);
6797 goto error;
6798 }
6799 insize++;
6800 }
6801 /* 4=maximum length of a UTF-8 sequence */
6802 while (insize <= 4 && (in + insize) <= endin);
6803
6804 if (outsize <= 0) {
6805 Py_ssize_t startinpos, endinpos, outpos;
6806
6807 startinpos = in - startin;
6808 endinpos = startinpos + 1;
6809 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006810 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006811 errors, &errorHandler,
6812 encoding, reason,
6813 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006814 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006815 {
6816 goto error;
6817 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006818 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006819 }
6820 else {
6821 in += insize;
6822 memcpy(out, buffer, outsize * sizeof(wchar_t));
6823 out += outsize;
6824 }
6825 }
6826
6827 /* write a NUL character at the end */
6828 *out = 0;
6829
6830 /* Extend unicode object */
6831 outsize = out - startout;
6832 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006833 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006834 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006835 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006836
6837error:
6838 Py_XDECREF(encoding_obj);
6839 Py_XDECREF(errorHandler);
6840 Py_XDECREF(exc);
6841 return ret;
6842}
6843
Victor Stinner3a50e702011-10-18 21:21:00 +02006844static PyObject *
6845decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006846 const char *s, Py_ssize_t size,
6847 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848{
Victor Stinner76a31a62011-11-04 00:05:13 +01006849 PyObject *v = NULL;
6850 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006851
Victor Stinner3a50e702011-10-18 21:21:00 +02006852 if (code_page < 0) {
6853 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6854 return NULL;
6855 }
6856
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006857 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006859
Victor Stinner76a31a62011-11-04 00:05:13 +01006860 do
6861 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006863 if (size > INT_MAX) {
6864 chunk_size = INT_MAX;
6865 final = 0;
6866 done = 0;
6867 }
6868 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006870 {
6871 chunk_size = (int)size;
6872 final = (consumed == NULL);
6873 done = 1;
6874 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006875
Victor Stinner76a31a62011-11-04 00:05:13 +01006876 /* Skip trailing lead-byte unless 'final' is set */
6877 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6878 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879
Victor Stinner76a31a62011-11-04 00:05:13 +01006880 if (chunk_size == 0 && done) {
6881 if (v != NULL)
6882 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006883 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006884 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885
Victor Stinner76a31a62011-11-04 00:05:13 +01006886
6887 converted = decode_code_page_strict(code_page, &v,
6888 s, chunk_size);
6889 if (converted == -2)
6890 converted = decode_code_page_errors(code_page, &v,
6891 s, chunk_size,
6892 errors);
6893 assert(converted != 0);
6894
6895 if (converted < 0) {
6896 Py_XDECREF(v);
6897 return NULL;
6898 }
6899
6900 if (consumed)
6901 *consumed += converted;
6902
6903 s += converted;
6904 size -= converted;
6905 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006906
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006907 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006908}
6909
Alexander Belopolsky40018472011-02-26 01:02:56 +00006910PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006911PyUnicode_DecodeCodePageStateful(int code_page,
6912 const char *s,
6913 Py_ssize_t size,
6914 const char *errors,
6915 Py_ssize_t *consumed)
6916{
6917 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6918}
6919
6920PyObject *
6921PyUnicode_DecodeMBCSStateful(const char *s,
6922 Py_ssize_t size,
6923 const char *errors,
6924 Py_ssize_t *consumed)
6925{
6926 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6927}
6928
6929PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006930PyUnicode_DecodeMBCS(const char *s,
6931 Py_ssize_t size,
6932 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006933{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6935}
6936
Victor Stinner3a50e702011-10-18 21:21:00 +02006937static DWORD
6938encode_code_page_flags(UINT code_page, const char *errors)
6939{
6940 if (code_page == CP_UTF8) {
6941 if (winver.dwMajorVersion >= 6)
6942 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6943 and later */
6944 return WC_ERR_INVALID_CHARS;
6945 else
6946 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6947 return 0;
6948 }
6949 else if (code_page == CP_UTF7) {
6950 /* CP_UTF7 only supports flags=0 */
6951 return 0;
6952 }
6953 else {
6954 if (errors != NULL && strcmp(errors, "replace") == 0)
6955 return 0;
6956 else
6957 return WC_NO_BEST_FIT_CHARS;
6958 }
6959}
6960
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006962 * Encode a Unicode string to a Windows code page into a byte string in strict
6963 * mode.
6964 *
6965 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006966 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006967 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006968static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006969encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006970 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006972{
Victor Stinner554f3f02010-06-16 23:33:54 +00006973 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006974 BOOL *pusedDefaultChar = &usedDefaultChar;
6975 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006976 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006977 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006978 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 const DWORD flags = encode_code_page_flags(code_page, NULL);
6980 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006981 /* Create a substring so that we can get the UTF-16 representation
6982 of just the slice under consideration. */
6983 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006984
Martin v. Löwis3d325192011-11-04 18:23:06 +01006985 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006986
Victor Stinner3a50e702011-10-18 21:21:00 +02006987 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006988 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006989 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006990 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006991
Victor Stinner2fc507f2011-11-04 20:06:39 +01006992 substring = PyUnicode_Substring(unicode, offset, offset+len);
6993 if (substring == NULL)
6994 return -1;
6995 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6996 if (p == NULL) {
6997 Py_DECREF(substring);
6998 return -1;
6999 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007000 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007001
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007002 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007003 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007004 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 NULL, 0,
7006 NULL, pusedDefaultChar);
7007 if (outsize <= 0)
7008 goto error;
7009 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007010 if (pusedDefaultChar && *pusedDefaultChar) {
7011 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007012 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007013 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007014
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007018 if (*outbytes == NULL) {
7019 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007021 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007022 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023 }
7024 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007026 const Py_ssize_t n = PyBytes_Size(*outbytes);
7027 if (outsize > PY_SSIZE_T_MAX - n) {
7028 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007029 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007031 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007032 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7033 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007034 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007035 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007036 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037 }
7038
7039 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007040 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007041 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007042 out, outsize,
7043 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007044 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007045 if (outsize <= 0)
7046 goto error;
7047 if (pusedDefaultChar && *pusedDefaultChar)
7048 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007049 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007050
Victor Stinner3a50e702011-10-18 21:21:00 +02007051error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007052 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7054 return -2;
7055 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007056 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007057}
7058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059/*
7060 * Encode a Unicode string to a Windows code page into a byte string using a
7061 * error handler.
7062 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007063 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007064 * -1 on other error.
7065 */
7066static int
7067encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007068 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007069 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007070{
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007072 Py_ssize_t pos = unicode_offset;
7073 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 /* Ideally, we should get reason from FormatMessage. This is the Windows
7075 2000 English version of the message. */
7076 const char *reason = "invalid character";
7077 /* 4=maximum length of a UTF-8 sequence */
7078 char buffer[4];
7079 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7080 Py_ssize_t outsize;
7081 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 PyObject *errorHandler = NULL;
7083 PyObject *exc = NULL;
7084 PyObject *encoding_obj = NULL;
7085 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007086 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 PyObject *rep;
7088 int ret = -1;
7089
7090 assert(insize > 0);
7091
7092 encoding = code_page_name(code_page, &encoding_obj);
7093 if (encoding == NULL)
7094 return -1;
7095
7096 if (errors == NULL || strcmp(errors, "strict") == 0) {
7097 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7098 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007099 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 if (exc != NULL) {
7101 PyCodec_StrictErrors(exc);
7102 Py_DECREF(exc);
7103 }
7104 Py_XDECREF(encoding_obj);
7105 return -1;
7106 }
7107
7108 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7109 pusedDefaultChar = &usedDefaultChar;
7110 else
7111 pusedDefaultChar = NULL;
7112
7113 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7114 PyErr_NoMemory();
7115 goto error;
7116 }
7117 outsize = insize * Py_ARRAY_LENGTH(buffer);
7118
7119 if (*outbytes == NULL) {
7120 /* Create string object */
7121 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7122 if (*outbytes == NULL)
7123 goto error;
7124 out = PyBytes_AS_STRING(*outbytes);
7125 }
7126 else {
7127 /* Extend string object */
7128 Py_ssize_t n = PyBytes_Size(*outbytes);
7129 if (n > PY_SSIZE_T_MAX - outsize) {
7130 PyErr_NoMemory();
7131 goto error;
7132 }
7133 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7134 goto error;
7135 out = PyBytes_AS_STRING(*outbytes) + n;
7136 }
7137
7138 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007139 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007141 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7142 wchar_t chars[2];
7143 int charsize;
7144 if (ch < 0x10000) {
7145 chars[0] = (wchar_t)ch;
7146 charsize = 1;
7147 }
7148 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007149 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7150 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007151 charsize = 2;
7152 }
7153
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007155 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 buffer, Py_ARRAY_LENGTH(buffer),
7157 NULL, pusedDefaultChar);
7158 if (outsize > 0) {
7159 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7160 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007161 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 memcpy(out, buffer, outsize);
7163 out += outsize;
7164 continue;
7165 }
7166 }
7167 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7168 PyErr_SetFromWindowsErr(0);
7169 goto error;
7170 }
7171
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 rep = unicode_encode_call_errorhandler(
7173 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007174 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007175 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 if (rep == NULL)
7177 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007178 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007179
7180 if (PyBytes_Check(rep)) {
7181 outsize = PyBytes_GET_SIZE(rep);
7182 if (outsize != 1) {
7183 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7184 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7185 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7186 Py_DECREF(rep);
7187 goto error;
7188 }
7189 out = PyBytes_AS_STRING(*outbytes) + offset;
7190 }
7191 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7192 out += outsize;
7193 }
7194 else {
7195 Py_ssize_t i;
7196 enum PyUnicode_Kind kind;
7197 void *data;
7198
Benjamin Petersonbac79492012-01-14 13:34:47 -05007199 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 Py_DECREF(rep);
7201 goto error;
7202 }
7203
7204 outsize = PyUnicode_GET_LENGTH(rep);
7205 if (outsize != 1) {
7206 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7207 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7208 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7209 Py_DECREF(rep);
7210 goto error;
7211 }
7212 out = PyBytes_AS_STRING(*outbytes) + offset;
7213 }
7214 kind = PyUnicode_KIND(rep);
7215 data = PyUnicode_DATA(rep);
7216 for (i=0; i < outsize; i++) {
7217 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7218 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007219 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007220 encoding, unicode,
7221 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 "unable to encode error handler result to ASCII");
7223 Py_DECREF(rep);
7224 goto error;
7225 }
7226 *out = (unsigned char)ch;
7227 out++;
7228 }
7229 }
7230 Py_DECREF(rep);
7231 }
7232 /* write a NUL byte */
7233 *out = 0;
7234 outsize = out - PyBytes_AS_STRING(*outbytes);
7235 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7236 if (_PyBytes_Resize(outbytes, outsize) < 0)
7237 goto error;
7238 ret = 0;
7239
7240error:
7241 Py_XDECREF(encoding_obj);
7242 Py_XDECREF(errorHandler);
7243 Py_XDECREF(exc);
7244 return ret;
7245}
7246
Victor Stinner3a50e702011-10-18 21:21:00 +02007247static PyObject *
7248encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007249 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 const char *errors)
7251{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007252 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007254 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007255 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007256
Benjamin Petersonbac79492012-01-14 13:34:47 -05007257 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007258 return NULL;
7259 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007260
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 if (code_page < 0) {
7262 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7263 return NULL;
7264 }
7265
Martin v. Löwis3d325192011-11-04 18:23:06 +01007266 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007267 return PyBytes_FromStringAndSize(NULL, 0);
7268
Victor Stinner7581cef2011-11-03 22:32:33 +01007269 offset = 0;
7270 do
7271 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007274 chunks. */
7275 if (len > INT_MAX/2) {
7276 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007277 done = 0;
7278 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007279 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007280#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007281 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007282 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007283 done = 1;
7284 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007285
Victor Stinner76a31a62011-11-04 00:05:13 +01007286 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007287 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 errors);
7289 if (ret == -2)
7290 ret = encode_code_page_errors(code_page, &outbytes,
7291 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007292 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007293 if (ret < 0) {
7294 Py_XDECREF(outbytes);
7295 return NULL;
7296 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007297
Victor Stinner7581cef2011-11-03 22:32:33 +01007298 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007299 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007300 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 return outbytes;
7303}
7304
7305PyObject *
7306PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7307 Py_ssize_t size,
7308 const char *errors)
7309{
Victor Stinner7581cef2011-11-03 22:32:33 +01007310 PyObject *unicode, *res;
7311 unicode = PyUnicode_FromUnicode(p, size);
7312 if (unicode == NULL)
7313 return NULL;
7314 res = encode_code_page(CP_ACP, unicode, errors);
7315 Py_DECREF(unicode);
7316 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007317}
7318
7319PyObject *
7320PyUnicode_EncodeCodePage(int code_page,
7321 PyObject *unicode,
7322 const char *errors)
7323{
Victor Stinner7581cef2011-11-03 22:32:33 +01007324 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007325}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007326
Alexander Belopolsky40018472011-02-26 01:02:56 +00007327PyObject *
7328PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007329{
7330 if (!PyUnicode_Check(unicode)) {
7331 PyErr_BadArgument();
7332 return NULL;
7333 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007334 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007335}
7336
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337#undef NEED_RETRY
7338
Victor Stinner99b95382011-07-04 14:23:54 +02007339#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007340
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341/* --- Character Mapping Codec -------------------------------------------- */
7342
Victor Stinnerfb161b12013-04-18 01:44:27 +02007343static int
7344charmap_decode_string(const char *s,
7345 Py_ssize_t size,
7346 PyObject *mapping,
7347 const char *errors,
7348 _PyUnicodeWriter *writer)
7349{
7350 const char *starts = s;
7351 const char *e;
7352 Py_ssize_t startinpos, endinpos;
7353 PyObject *errorHandler = NULL, *exc = NULL;
7354 Py_ssize_t maplen;
7355 enum PyUnicode_Kind mapkind;
7356 void *mapdata;
7357 Py_UCS4 x;
7358 unsigned char ch;
7359
7360 if (PyUnicode_READY(mapping) == -1)
7361 return -1;
7362
7363 maplen = PyUnicode_GET_LENGTH(mapping);
7364 mapdata = PyUnicode_DATA(mapping);
7365 mapkind = PyUnicode_KIND(mapping);
7366
7367 e = s + size;
7368
7369 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7370 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7371 * is disabled in encoding aliases, latin1 is preferred because
7372 * its implementation is faster. */
7373 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7374 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7375 Py_UCS4 maxchar = writer->maxchar;
7376
7377 assert (writer->kind == PyUnicode_1BYTE_KIND);
7378 while (s < e) {
7379 ch = *s;
7380 x = mapdata_ucs1[ch];
7381 if (x > maxchar) {
7382 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7383 goto onError;
7384 maxchar = writer->maxchar;
7385 outdata = (Py_UCS1 *)writer->data;
7386 }
7387 outdata[writer->pos] = x;
7388 writer->pos++;
7389 ++s;
7390 }
7391 return 0;
7392 }
7393
7394 while (s < e) {
7395 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7396 enum PyUnicode_Kind outkind = writer->kind;
7397 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7398 if (outkind == PyUnicode_1BYTE_KIND) {
7399 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7400 Py_UCS4 maxchar = writer->maxchar;
7401 while (s < e) {
7402 ch = *s;
7403 x = mapdata_ucs2[ch];
7404 if (x > maxchar)
7405 goto Error;
7406 outdata[writer->pos] = x;
7407 writer->pos++;
7408 ++s;
7409 }
7410 break;
7411 }
7412 else if (outkind == PyUnicode_2BYTE_KIND) {
7413 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7414 while (s < e) {
7415 ch = *s;
7416 x = mapdata_ucs2[ch];
7417 if (x == 0xFFFE)
7418 goto Error;
7419 outdata[writer->pos] = x;
7420 writer->pos++;
7421 ++s;
7422 }
7423 break;
7424 }
7425 }
7426 ch = *s;
7427
7428 if (ch < maplen)
7429 x = PyUnicode_READ(mapkind, mapdata, ch);
7430 else
7431 x = 0xfffe; /* invalid value */
7432Error:
7433 if (x == 0xfffe)
7434 {
7435 /* undefined mapping */
7436 startinpos = s-starts;
7437 endinpos = startinpos+1;
7438 if (unicode_decode_call_errorhandler_writer(
7439 errors, &errorHandler,
7440 "charmap", "character maps to <undefined>",
7441 &starts, &e, &startinpos, &endinpos, &exc, &s,
7442 writer)) {
7443 goto onError;
7444 }
7445 continue;
7446 }
7447
7448 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7449 goto onError;
7450 ++s;
7451 }
7452 Py_XDECREF(errorHandler);
7453 Py_XDECREF(exc);
7454 return 0;
7455
7456onError:
7457 Py_XDECREF(errorHandler);
7458 Py_XDECREF(exc);
7459 return -1;
7460}
7461
7462static int
7463charmap_decode_mapping(const char *s,
7464 Py_ssize_t size,
7465 PyObject *mapping,
7466 const char *errors,
7467 _PyUnicodeWriter *writer)
7468{
7469 const char *starts = s;
7470 const char *e;
7471 Py_ssize_t startinpos, endinpos;
7472 PyObject *errorHandler = NULL, *exc = NULL;
7473 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007474 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007475
7476 e = s + size;
7477
7478 while (s < e) {
7479 ch = *s;
7480
7481 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7482 key = PyLong_FromLong((long)ch);
7483 if (key == NULL)
7484 goto onError;
7485
7486 item = PyObject_GetItem(mapping, key);
7487 Py_DECREF(key);
7488 if (item == NULL) {
7489 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7490 /* No mapping found means: mapping is undefined. */
7491 PyErr_Clear();
7492 goto Undefined;
7493 } else
7494 goto onError;
7495 }
7496
7497 /* Apply mapping */
7498 if (item == Py_None)
7499 goto Undefined;
7500 if (PyLong_Check(item)) {
7501 long value = PyLong_AS_LONG(item);
7502 if (value == 0xFFFE)
7503 goto Undefined;
7504 if (value < 0 || value > MAX_UNICODE) {
7505 PyErr_Format(PyExc_TypeError,
7506 "character mapping must be in range(0x%lx)",
7507 (unsigned long)MAX_UNICODE + 1);
7508 goto onError;
7509 }
7510
7511 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7512 goto onError;
7513 }
7514 else if (PyUnicode_Check(item)) {
7515 if (PyUnicode_READY(item) == -1)
7516 goto onError;
7517 if (PyUnicode_GET_LENGTH(item) == 1) {
7518 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7519 if (value == 0xFFFE)
7520 goto Undefined;
7521 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7522 goto onError;
7523 }
7524 else {
7525 writer->overallocate = 1;
7526 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7527 goto onError;
7528 }
7529 }
7530 else {
7531 /* wrong return value */
7532 PyErr_SetString(PyExc_TypeError,
7533 "character mapping must return integer, None or str");
7534 goto onError;
7535 }
7536 Py_CLEAR(item);
7537 ++s;
7538 continue;
7539
7540Undefined:
7541 /* undefined mapping */
7542 Py_CLEAR(item);
7543 startinpos = s-starts;
7544 endinpos = startinpos+1;
7545 if (unicode_decode_call_errorhandler_writer(
7546 errors, &errorHandler,
7547 "charmap", "character maps to <undefined>",
7548 &starts, &e, &startinpos, &endinpos, &exc, &s,
7549 writer)) {
7550 goto onError;
7551 }
7552 }
7553 Py_XDECREF(errorHandler);
7554 Py_XDECREF(exc);
7555 return 0;
7556
7557onError:
7558 Py_XDECREF(item);
7559 Py_XDECREF(errorHandler);
7560 Py_XDECREF(exc);
7561 return -1;
7562}
7563
Alexander Belopolsky40018472011-02-26 01:02:56 +00007564PyObject *
7565PyUnicode_DecodeCharmap(const char *s,
7566 Py_ssize_t size,
7567 PyObject *mapping,
7568 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007570 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007571
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 /* Default to Latin-1 */
7573 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007577 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007578 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007579 writer.min_length = size;
7580 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007582
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007583 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007584 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7585 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007586 }
7587 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007588 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7589 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007591 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007592
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007594 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 return NULL;
7596}
7597
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007598/* Charmap encoding: the lookup table */
7599
Alexander Belopolsky40018472011-02-26 01:02:56 +00007600struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 PyObject_HEAD
7602 unsigned char level1[32];
7603 int count2, count3;
7604 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007605};
7606
7607static PyObject*
7608encoding_map_size(PyObject *obj, PyObject* args)
7609{
7610 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007613}
7614
7615static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007616 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 PyDoc_STR("Return the size (in bytes) of this object") },
7618 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619};
7620
7621static void
7622encoding_map_dealloc(PyObject* o)
7623{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007624 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007625}
7626
7627static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 "EncodingMap", /*tp_name*/
7630 sizeof(struct encoding_map), /*tp_basicsize*/
7631 0, /*tp_itemsize*/
7632 /* methods */
7633 encoding_map_dealloc, /*tp_dealloc*/
7634 0, /*tp_print*/
7635 0, /*tp_getattr*/
7636 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007637 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 0, /*tp_repr*/
7639 0, /*tp_as_number*/
7640 0, /*tp_as_sequence*/
7641 0, /*tp_as_mapping*/
7642 0, /*tp_hash*/
7643 0, /*tp_call*/
7644 0, /*tp_str*/
7645 0, /*tp_getattro*/
7646 0, /*tp_setattro*/
7647 0, /*tp_as_buffer*/
7648 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7649 0, /*tp_doc*/
7650 0, /*tp_traverse*/
7651 0, /*tp_clear*/
7652 0, /*tp_richcompare*/
7653 0, /*tp_weaklistoffset*/
7654 0, /*tp_iter*/
7655 0, /*tp_iternext*/
7656 encoding_map_methods, /*tp_methods*/
7657 0, /*tp_members*/
7658 0, /*tp_getset*/
7659 0, /*tp_base*/
7660 0, /*tp_dict*/
7661 0, /*tp_descr_get*/
7662 0, /*tp_descr_set*/
7663 0, /*tp_dictoffset*/
7664 0, /*tp_init*/
7665 0, /*tp_alloc*/
7666 0, /*tp_new*/
7667 0, /*tp_free*/
7668 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007669};
7670
7671PyObject*
7672PyUnicode_BuildEncodingMap(PyObject* string)
7673{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007674 PyObject *result;
7675 struct encoding_map *mresult;
7676 int i;
7677 int need_dict = 0;
7678 unsigned char level1[32];
7679 unsigned char level2[512];
7680 unsigned char *mlevel1, *mlevel2, *mlevel3;
7681 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007682 int kind;
7683 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007684 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007685 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007686
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007687 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007688 PyErr_BadArgument();
7689 return NULL;
7690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007691 kind = PyUnicode_KIND(string);
7692 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007693 length = PyUnicode_GET_LENGTH(string);
7694 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007695 memset(level1, 0xFF, sizeof level1);
7696 memset(level2, 0xFF, sizeof level2);
7697
7698 /* If there isn't a one-to-one mapping of NULL to \0,
7699 or if there are non-BMP characters, we need to use
7700 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007701 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007702 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007703 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007705 ch = PyUnicode_READ(kind, data, i);
7706 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007707 need_dict = 1;
7708 break;
7709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007710 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007711 /* unmapped character */
7712 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007713 l1 = ch >> 11;
7714 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007715 if (level1[l1] == 0xFF)
7716 level1[l1] = count2++;
7717 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007718 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007719 }
7720
7721 if (count2 >= 0xFF || count3 >= 0xFF)
7722 need_dict = 1;
7723
7724 if (need_dict) {
7725 PyObject *result = PyDict_New();
7726 PyObject *key, *value;
7727 if (!result)
7728 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007729 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007731 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007732 if (!key || !value)
7733 goto failed1;
7734 if (PyDict_SetItem(result, key, value) == -1)
7735 goto failed1;
7736 Py_DECREF(key);
7737 Py_DECREF(value);
7738 }
7739 return result;
7740 failed1:
7741 Py_XDECREF(key);
7742 Py_XDECREF(value);
7743 Py_DECREF(result);
7744 return NULL;
7745 }
7746
7747 /* Create a three-level trie */
7748 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7749 16*count2 + 128*count3 - 1);
7750 if (!result)
7751 return PyErr_NoMemory();
7752 PyObject_Init(result, &EncodingMapType);
7753 mresult = (struct encoding_map*)result;
7754 mresult->count2 = count2;
7755 mresult->count3 = count3;
7756 mlevel1 = mresult->level1;
7757 mlevel2 = mresult->level23;
7758 mlevel3 = mresult->level23 + 16*count2;
7759 memcpy(mlevel1, level1, 32);
7760 memset(mlevel2, 0xFF, 16*count2);
7761 memset(mlevel3, 0, 128*count3);
7762 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007763 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007764 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007765 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7766 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767 /* unmapped character */
7768 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007769 o1 = ch>>11;
7770 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007771 i2 = 16*mlevel1[o1] + o2;
7772 if (mlevel2[i2] == 0xFF)
7773 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007774 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775 i3 = 128*mlevel2[i2] + o3;
7776 mlevel3[i3] = i;
7777 }
7778 return result;
7779}
7780
7781static int
Victor Stinner22168992011-11-20 17:09:18 +01007782encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783{
7784 struct encoding_map *map = (struct encoding_map*)mapping;
7785 int l1 = c>>11;
7786 int l2 = (c>>7) & 0xF;
7787 int l3 = c & 0x7F;
7788 int i;
7789
Victor Stinner22168992011-11-20 17:09:18 +01007790 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007792 if (c == 0)
7793 return 0;
7794 /* level 1*/
7795 i = map->level1[l1];
7796 if (i == 0xFF) {
7797 return -1;
7798 }
7799 /* level 2*/
7800 i = map->level23[16*i+l2];
7801 if (i == 0xFF) {
7802 return -1;
7803 }
7804 /* level 3 */
7805 i = map->level23[16*map->count2 + 128*i + l3];
7806 if (i == 0) {
7807 return -1;
7808 }
7809 return i;
7810}
7811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812/* Lookup the character ch in the mapping. If the character
7813 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007814 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007815static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007816charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817{
Christian Heimes217cfd12007-12-02 14:31:20 +00007818 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007819 PyObject *x;
7820
7821 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823 x = PyObject_GetItem(mapping, w);
7824 Py_DECREF(w);
7825 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7827 /* No mapping found means: mapping is undefined. */
7828 PyErr_Clear();
7829 x = Py_None;
7830 Py_INCREF(x);
7831 return x;
7832 } else
7833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007835 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007837 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 long value = PyLong_AS_LONG(x);
7839 if (value < 0 || value > 255) {
7840 PyErr_SetString(PyExc_TypeError,
7841 "character mapping must be in range(256)");
7842 Py_DECREF(x);
7843 return NULL;
7844 }
7845 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007847 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 /* wrong return value */
7851 PyErr_Format(PyExc_TypeError,
7852 "character mapping must return integer, bytes or None, not %.400s",
7853 x->ob_type->tp_name);
7854 Py_DECREF(x);
7855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 }
7857}
7858
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007859static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007860charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007862 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7863 /* exponentially overallocate to minimize reallocations */
7864 if (requiredsize < 2*outsize)
7865 requiredsize = 2*outsize;
7866 if (_PyBytes_Resize(outobj, requiredsize))
7867 return -1;
7868 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007869}
7870
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred; an exception is pending */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007874/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007875 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876 space is available. Return a new reference to the object that
7877 was put in the output buffer, or Py_None, if the mapping was undefined
7878 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007879 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007880static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007881charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007882 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007883{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007884 PyObject *rep;
7885 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007886 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007887
Christian Heimes90aa7642007-12-19 02:45:37 +00007888 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 if (res == -1)
7892 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 if (outsize<requiredsize)
7894 if (charmapencode_resize(outobj, outpos, requiredsize))
7895 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007896 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 outstart[(*outpos)++] = (char)res;
7898 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899 }
7900
7901 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007902 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007904 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_DECREF(rep);
7906 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007907 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 if (PyLong_Check(rep)) {
7909 Py_ssize_t requiredsize = *outpos+1;
7910 if (outsize<requiredsize)
7911 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7912 Py_DECREF(rep);
7913 return enc_EXCEPTION;
7914 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007915 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007917 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 else {
7919 const char *repchars = PyBytes_AS_STRING(rep);
7920 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7921 Py_ssize_t requiredsize = *outpos+repsize;
7922 if (outsize<requiredsize)
7923 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7924 Py_DECREF(rep);
7925 return enc_EXCEPTION;
7926 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007927 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 memcpy(outstart + *outpos, repchars, repsize);
7929 *outpos += repsize;
7930 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007931 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007932 Py_DECREF(rep);
7933 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934}
7935
7936/* handle an error in PyUnicode_EncodeCharmap
7937 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007938static int
7939charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007940 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007942 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007943 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007944{
7945 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007946 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007947 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007948 enum PyUnicode_Kind kind;
7949 void *data;
7950 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007951 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007952 Py_ssize_t collstartpos = *inpos;
7953 Py_ssize_t collendpos = *inpos+1;
7954 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007955 char *encoding = "charmap";
7956 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007957 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007958 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007959 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007960
Benjamin Petersonbac79492012-01-14 13:34:47 -05007961 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007962 return -1;
7963 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007964 /* find all unencodable characters */
7965 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007966 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007967 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007968 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007969 val = encoding_map_lookup(ch, mapping);
7970 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 break;
7972 ++collendpos;
7973 continue;
7974 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007975
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007976 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7977 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 if (rep==NULL)
7979 return -1;
7980 else if (rep!=Py_None) {
7981 Py_DECREF(rep);
7982 break;
7983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007984 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986 }
7987 /* cache callback name lookup
7988 * (if not done yet, i.e. it's the first error) */
7989 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 if ((errors==NULL) || (!strcmp(errors, "strict")))
7991 *known_errorHandler = 1;
7992 else if (!strcmp(errors, "replace"))
7993 *known_errorHandler = 2;
7994 else if (!strcmp(errors, "ignore"))
7995 *known_errorHandler = 3;
7996 else if (!strcmp(errors, "xmlcharrefreplace"))
7997 *known_errorHandler = 4;
7998 else
7999 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008000 }
8001 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008002 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008003 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 return -1;
8005 case 2: /* replace */
8006 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 x = charmapencode_output('?', mapping, res, respos);
8008 if (x==enc_EXCEPTION) {
8009 return -1;
8010 }
8011 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008012 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 return -1;
8014 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008015 }
8016 /* fall through */
8017 case 3: /* ignore */
8018 *inpos = collendpos;
8019 break;
8020 case 4: /* xmlcharrefreplace */
8021 /* generate replacement (temporarily (mis)uses p) */
8022 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 char buffer[2+29+1+1];
8024 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008025 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 for (cp = buffer; *cp; ++cp) {
8027 x = charmapencode_output(*cp, mapping, res, respos);
8028 if (x==enc_EXCEPTION)
8029 return -1;
8030 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008031 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 return -1;
8033 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008034 }
8035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 *inpos = collendpos;
8037 break;
8038 default:
8039 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008040 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008042 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008044 if (PyBytes_Check(repunicode)) {
8045 /* Directly copy bytes result to output. */
8046 Py_ssize_t outsize = PyBytes_Size(*res);
8047 Py_ssize_t requiredsize;
8048 repsize = PyBytes_Size(repunicode);
8049 requiredsize = *respos + repsize;
8050 if (requiredsize > outsize)
8051 /* Make room for all additional bytes. */
8052 if (charmapencode_resize(res, respos, requiredsize)) {
8053 Py_DECREF(repunicode);
8054 return -1;
8055 }
8056 memcpy(PyBytes_AsString(*res) + *respos,
8057 PyBytes_AsString(repunicode), repsize);
8058 *respos += repsize;
8059 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008060 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008061 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008062 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008063 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008064 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008065 Py_DECREF(repunicode);
8066 return -1;
8067 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008068 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008069 data = PyUnicode_DATA(repunicode);
8070 kind = PyUnicode_KIND(repunicode);
8071 for (index = 0; index < repsize; index++) {
8072 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8073 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008075 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 return -1;
8077 }
8078 else if (x==enc_FAILED) {
8079 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008080 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 return -1;
8082 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 }
8084 *inpos = newpos;
8085 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086 }
8087 return 0;
8088}
8089
Alexander Belopolsky40018472011-02-26 01:02:56 +00008090PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008091_PyUnicode_EncodeCharmap(PyObject *unicode,
8092 PyObject *mapping,
8093 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095 /* output object */
8096 PyObject *res = NULL;
8097 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008098 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008099 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008101 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102 PyObject *errorHandler = NULL;
8103 PyObject *exc = NULL;
8104 /* the following variable is used for caching string comparisons
8105 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8106 * 3=ignore, 4=xmlcharrefreplace */
8107 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008108 void *data;
8109 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110
Benjamin Petersonbac79492012-01-14 13:34:47 -05008111 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008112 return NULL;
8113 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008114 data = PyUnicode_DATA(unicode);
8115 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008116
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 /* Default to Latin-1 */
8118 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008119 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 /* allocate enough for a simple encoding without
8122 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008123 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008124 if (res == NULL)
8125 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008126 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008130 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008132 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 if (x==enc_EXCEPTION) /* error */
8134 goto onError;
8135 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008136 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 &exc,
8138 &known_errorHandler, &errorHandler, errors,
8139 &res, &respos)) {
8140 goto onError;
8141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 else
8144 /* done with this character => adjust input position */
8145 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008149 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008150 if (_PyBytes_Resize(&res, respos) < 0)
8151 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008152
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153 Py_XDECREF(exc);
8154 Py_XDECREF(errorHandler);
8155 return res;
8156
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 Py_XDECREF(res);
8159 Py_XDECREF(exc);
8160 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 return NULL;
8162}
8163
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008164/* Deprecated */
8165PyObject *
8166PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8167 Py_ssize_t size,
8168 PyObject *mapping,
8169 const char *errors)
8170{
8171 PyObject *result;
8172 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8173 if (unicode == NULL)
8174 return NULL;
8175 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8176 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008177 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008178}
8179
Alexander Belopolsky40018472011-02-26 01:02:56 +00008180PyObject *
8181PyUnicode_AsCharmapString(PyObject *unicode,
8182 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183{
8184 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 PyErr_BadArgument();
8186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008188 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189}
8190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008192static void
8193make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008194 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008195 Py_ssize_t startpos, Py_ssize_t endpos,
8196 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199 *exceptionObject = _PyUnicodeTranslateError_Create(
8200 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 }
8202 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8204 goto onError;
8205 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8206 goto onError;
8207 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8208 goto onError;
8209 return;
8210 onError:
8211 Py_DECREF(*exceptionObject);
8212 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 }
8214}
8215
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216/* error handling callback helper:
8217 build arguments, call the callback and check the arguments,
8218 put the result into newpos and return the replacement string, which
8219 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008220static PyObject *
8221unicode_translate_call_errorhandler(const char *errors,
8222 PyObject **errorHandler,
8223 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008224 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008225 Py_ssize_t startpos, Py_ssize_t endpos,
8226 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008228 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008230 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231 PyObject *restuple;
8232 PyObject *resunicode;
8233
8234 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238 }
8239
8240 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244
8245 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008250 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 Py_DECREF(restuple);
8252 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 }
8254 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 &resunicode, &i_newpos)) {
8256 Py_DECREF(restuple);
8257 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008259 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008261 else
8262 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8265 Py_DECREF(restuple);
8266 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 Py_INCREF(resunicode);
8269 Py_DECREF(restuple);
8270 return resunicode;
8271}
8272
8273/* Lookup the character ch in the mapping and put the result in result,
8274 which must be decrefed by the caller.
8275 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008276static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278{
Christian Heimes217cfd12007-12-02 14:31:20 +00008279 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 PyObject *x;
8281
8282 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284 x = PyObject_GetItem(mapping, w);
8285 Py_DECREF(w);
8286 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8288 /* No mapping found means: use 1:1 mapping. */
8289 PyErr_Clear();
8290 *result = NULL;
8291 return 0;
8292 } else
8293 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 }
8295 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 *result = x;
8297 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008299 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 long value = PyLong_AS_LONG(x);
8301 long max = PyUnicode_GetMax();
8302 if (value < 0 || value > max) {
8303 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008304 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 Py_DECREF(x);
8306 return -1;
8307 }
8308 *result = x;
8309 return 0;
8310 }
8311 else if (PyUnicode_Check(x)) {
8312 *result = x;
8313 return 0;
8314 }
8315 else {
8316 /* wrong return value */
8317 PyErr_SetString(PyExc_TypeError,
8318 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008319 Py_DECREF(x);
8320 return -1;
8321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322}
8323/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 if not reallocate and adjust various state variables.
8325 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008331 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008332 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 /* exponentially overallocate to minimize reallocations */
8334 if (requiredsize < 2 * oldsize)
8335 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008336 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8337 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008339 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 }
8342 return 0;
8343}
8344/* lookup the character, put the result in the output string and adjust
8345 various state variables. Return a new reference to the object that
8346 was put in the output buffer in *result, or Py_None, if the mapping was
8347 undefined (in which case no character was written).
8348 The called must decref result.
8349 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008350static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8352 PyObject *mapping, Py_UCS4 **output,
8353 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008354 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8357 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 }
8363 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008365 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 }
8369 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 Py_ssize_t repsize;
8371 if (PyUnicode_READY(*res) == -1)
8372 return -1;
8373 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 if (repsize==1) {
8375 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 }
8378 else if (repsize!=0) {
8379 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 Py_ssize_t requiredsize = *opos +
8381 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 Py_ssize_t i;
8384 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 for(i = 0; i < repsize; i++)
8387 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 }
8390 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 return 0;
8393}
8394
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396_PyUnicode_TranslateCharmap(PyObject *input,
8397 PyObject *mapping,
8398 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 /* input object */
8401 char *idata;
8402 Py_ssize_t size, i;
8403 int kind;
8404 /* output buffer */
8405 Py_UCS4 *output = NULL;
8406 Py_ssize_t osize;
8407 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 char *reason = "character maps to <undefined>";
8411 PyObject *errorHandler = NULL;
8412 PyObject *exc = NULL;
8413 /* the following variable is used for caching string comparisons
8414 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8415 * 3=ignore, 4=xmlcharrefreplace */
8416 int known_errorHandler = -1;
8417
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 PyErr_BadArgument();
8420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423 if (PyUnicode_READY(input) == -1)
8424 return NULL;
8425 idata = (char*)PyUnicode_DATA(input);
8426 kind = PyUnicode_KIND(input);
8427 size = PyUnicode_GET_LENGTH(input);
8428 i = 0;
8429
8430 if (size == 0) {
8431 Py_INCREF(input);
8432 return input;
8433 }
8434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 /* allocate enough for a simple 1:1 translation without
8436 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 osize = size;
8438 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8439 opos = 0;
8440 if (output == NULL) {
8441 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008445 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 /* try to encode it */
8447 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 if (charmaptranslate_output(input, i, mapping,
8449 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 Py_XDECREF(x);
8451 goto onError;
8452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 else { /* untranslatable character */
8457 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8458 Py_ssize_t repsize;
8459 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 Py_ssize_t collstart = i;
8463 Py_ssize_t collend = i+1;
8464 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 while (collend < size) {
8468 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 goto onError;
8470 Py_XDECREF(x);
8471 if (x!=Py_None)
8472 break;
8473 ++collend;
8474 }
8475 /* cache callback name lookup
8476 * (if not done yet, i.e. it's the first error) */
8477 if (known_errorHandler==-1) {
8478 if ((errors==NULL) || (!strcmp(errors, "strict")))
8479 known_errorHandler = 1;
8480 else if (!strcmp(errors, "replace"))
8481 known_errorHandler = 2;
8482 else if (!strcmp(errors, "ignore"))
8483 known_errorHandler = 3;
8484 else if (!strcmp(errors, "xmlcharrefreplace"))
8485 known_errorHandler = 4;
8486 else
8487 known_errorHandler = 0;
8488 }
8489 switch (known_errorHandler) {
8490 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008491 make_translate_exception(&exc,
8492 input, collstart, collend, reason);
8493 if (exc != NULL)
8494 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008495 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 case 2: /* replace */
8497 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 for (coll = collstart; coll<collend; coll++)
8499 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 /* fall through */
8501 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 break;
8504 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 /* generate replacement (temporarily (mis)uses i) */
8506 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 char buffer[2+29+1+1];
8508 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8510 if (charmaptranslate_makespace(&output, &osize,
8511 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 goto onError;
8513 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 break;
8518 default:
8519 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 reason, input, &exc,
8521 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008522 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008524 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008525 Py_DECREF(repunicode);
8526 goto onError;
8527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 repsize = PyUnicode_GET_LENGTH(repunicode);
8530 if (charmaptranslate_makespace(&output, &osize,
8531 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 Py_DECREF(repunicode);
8533 goto onError;
8534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 for (uni2 = 0; repsize-->0; ++uni2)
8536 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8537 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008539 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008540 }
8541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8543 if (!res)
8544 goto onError;
8545 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 Py_XDECREF(exc);
8547 Py_XDECREF(errorHandler);
8548 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 Py_XDECREF(exc);
8553 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 return NULL;
8555}
8556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557/* Deprecated. Use PyUnicode_Translate instead. */
8558PyObject *
8559PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8560 Py_ssize_t size,
8561 PyObject *mapping,
8562 const char *errors)
8563{
Christian Heimes5f520f42012-09-11 14:03:25 +02008564 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8566 if (!unicode)
8567 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008568 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8569 Py_DECREF(unicode);
8570 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571}
8572
Alexander Belopolsky40018472011-02-26 01:02:56 +00008573PyObject *
8574PyUnicode_Translate(PyObject *str,
8575 PyObject *mapping,
8576 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577{
8578 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008579
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 str = PyUnicode_FromObject(str);
8581 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008582 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 Py_DECREF(str);
8585 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586}
Tim Petersced69f82003-09-16 20:30:58 +00008587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008589fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590{
8591 /* No need to call PyUnicode_READY(self) because this function is only
8592 called as a callback from fixup() which does it already. */
8593 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8594 const int kind = PyUnicode_KIND(self);
8595 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008596 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008597 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 Py_ssize_t i;
8599
8600 for (i = 0; i < len; ++i) {
8601 ch = PyUnicode_READ(kind, data, i);
8602 fixed = 0;
8603 if (ch > 127) {
8604 if (Py_UNICODE_ISSPACE(ch))
8605 fixed = ' ';
8606 else {
8607 const int decimal = Py_UNICODE_TODECIMAL(ch);
8608 if (decimal >= 0)
8609 fixed = '0' + decimal;
8610 }
8611 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008612 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008613 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 PyUnicode_WRITE(kind, data, i, fixed);
8615 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008616 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008617 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 }
8620
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008621 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622}
8623
8624PyObject *
8625_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8626{
8627 if (!PyUnicode_Check(unicode)) {
8628 PyErr_BadInternalCall();
8629 return NULL;
8630 }
8631 if (PyUnicode_READY(unicode) == -1)
8632 return NULL;
8633 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8634 /* If the string is already ASCII, just return the same string */
8635 Py_INCREF(unicode);
8636 return unicode;
8637 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008638 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639}
8640
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008641PyObject *
8642PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8643 Py_ssize_t length)
8644{
Victor Stinnerf0124502011-11-21 23:12:56 +01008645 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008646 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008647 Py_UCS4 maxchar;
8648 enum PyUnicode_Kind kind;
8649 void *data;
8650
Victor Stinner99d7ad02012-02-22 13:37:39 +01008651 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008652 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008653 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008654 if (ch > 127) {
8655 int decimal = Py_UNICODE_TODECIMAL(ch);
8656 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008657 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008658 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008659 }
8660 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008661
8662 /* Copy to a new string */
8663 decimal = PyUnicode_New(length, maxchar);
8664 if (decimal == NULL)
8665 return decimal;
8666 kind = PyUnicode_KIND(decimal);
8667 data = PyUnicode_DATA(decimal);
8668 /* Iterate over code points */
8669 for (i = 0; i < length; i++) {
8670 Py_UNICODE ch = s[i];
8671 if (ch > 127) {
8672 int decimal = Py_UNICODE_TODECIMAL(ch);
8673 if (decimal >= 0)
8674 ch = '0' + decimal;
8675 }
8676 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008678 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008679}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008680/* --- Decimal Encoder ---------------------------------------------------- */
8681
Alexander Belopolsky40018472011-02-26 01:02:56 +00008682int
8683PyUnicode_EncodeDecimal(Py_UNICODE *s,
8684 Py_ssize_t length,
8685 char *output,
8686 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008687{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008688 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008689 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008690 enum PyUnicode_Kind kind;
8691 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008692
8693 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 PyErr_BadArgument();
8695 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008696 }
8697
Victor Stinner42bf7752011-11-21 22:52:58 +01008698 unicode = PyUnicode_FromUnicode(s, length);
8699 if (unicode == NULL)
8700 return -1;
8701
Benjamin Petersonbac79492012-01-14 13:34:47 -05008702 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008703 Py_DECREF(unicode);
8704 return -1;
8705 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008706 kind = PyUnicode_KIND(unicode);
8707 data = PyUnicode_DATA(unicode);
8708
Victor Stinnerb84d7232011-11-22 01:50:07 +01008709 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008710 PyObject *exc;
8711 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008713 Py_ssize_t startpos;
8714
8715 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008716
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008718 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008719 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008721 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 decimal = Py_UNICODE_TODECIMAL(ch);
8723 if (decimal >= 0) {
8724 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008725 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 continue;
8727 }
8728 if (0 < ch && ch < 256) {
8729 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008730 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 continue;
8732 }
Victor Stinner6345be92011-11-25 20:09:01 +01008733
Victor Stinner42bf7752011-11-21 22:52:58 +01008734 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008735 exc = NULL;
8736 raise_encode_exception(&exc, "decimal", unicode,
8737 startpos, startpos+1,
8738 "invalid decimal Unicode string");
8739 Py_XDECREF(exc);
8740 Py_DECREF(unicode);
8741 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008742 }
8743 /* 0-terminate the output string */
8744 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008745 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008746 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008747}
8748
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749/* --- Helpers ------------------------------------------------------------ */
8750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008752any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 Py_ssize_t start,
8754 Py_ssize_t end)
8755{
8756 int kind1, kind2, kind;
8757 void *buf1, *buf2;
8758 Py_ssize_t len1, len2, result;
8759
8760 kind1 = PyUnicode_KIND(s1);
8761 kind2 = PyUnicode_KIND(s2);
8762 kind = kind1 > kind2 ? kind1 : kind2;
8763 buf1 = PyUnicode_DATA(s1);
8764 buf2 = PyUnicode_DATA(s2);
8765 if (kind1 != kind)
8766 buf1 = _PyUnicode_AsKind(s1, kind);
8767 if (!buf1)
8768 return -2;
8769 if (kind2 != kind)
8770 buf2 = _PyUnicode_AsKind(s2, kind);
8771 if (!buf2) {
8772 if (kind1 != kind) PyMem_Free(buf1);
8773 return -2;
8774 }
8775 len1 = PyUnicode_GET_LENGTH(s1);
8776 len2 = PyUnicode_GET_LENGTH(s2);
8777
Victor Stinner794d5672011-10-10 03:21:36 +02008778 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008779 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008780 case PyUnicode_1BYTE_KIND:
8781 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8782 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8783 else
8784 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8785 break;
8786 case PyUnicode_2BYTE_KIND:
8787 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8788 break;
8789 case PyUnicode_4BYTE_KIND:
8790 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8791 break;
8792 default:
8793 assert(0); result = -2;
8794 }
8795 }
8796 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008797 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008798 case PyUnicode_1BYTE_KIND:
8799 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8800 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8801 else
8802 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8803 break;
8804 case PyUnicode_2BYTE_KIND:
8805 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8806 break;
8807 case PyUnicode_4BYTE_KIND:
8808 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8809 break;
8810 default:
8811 assert(0); result = -2;
8812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 }
8814
8815 if (kind1 != kind)
8816 PyMem_Free(buf1);
8817 if (kind2 != kind)
8818 PyMem_Free(buf2);
8819
8820 return result;
8821}
8822
8823Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008824_PyUnicode_InsertThousandsGrouping(
8825 PyObject *unicode, Py_ssize_t index,
8826 Py_ssize_t n_buffer,
8827 void *digits, Py_ssize_t n_digits,
8828 Py_ssize_t min_width,
8829 const char *grouping, PyObject *thousands_sep,
8830 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831{
Victor Stinner41a863c2012-02-24 00:37:51 +01008832 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008833 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008834 Py_ssize_t thousands_sep_len;
8835 Py_ssize_t len;
8836
8837 if (unicode != NULL) {
8838 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008839 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008840 }
8841 else {
8842 kind = PyUnicode_1BYTE_KIND;
8843 data = NULL;
8844 }
8845 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8846 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8847 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8848 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008849 if (thousands_sep_kind < kind) {
8850 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8851 if (!thousands_sep_data)
8852 return -1;
8853 }
8854 else {
8855 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8856 if (!data)
8857 return -1;
8858 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008859 }
8860
Benjamin Petersonead6b532011-12-20 17:23:42 -06008861 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008863 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008864 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008865 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008866 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008867 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008868 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008869 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008870 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008871 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008872 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008873 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008875 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008876 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008877 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008878 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008879 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008881 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008882 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008883 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008884 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008885 break;
8886 default:
8887 assert(0);
8888 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008890 if (unicode != NULL && thousands_sep_kind != kind) {
8891 if (thousands_sep_kind < kind)
8892 PyMem_Free(thousands_sep_data);
8893 else
8894 PyMem_Free(data);
8895 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008896 if (unicode == NULL) {
8897 *maxchar = 127;
8898 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008899 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008900 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008901 }
8902 }
8903 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904}
8905
8906
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, interprets negative `start`/`end` as offsets
   from the end of the sequence and clamps them at 0.  `start` and `end`
   must be modifiable lvalues; `len` is evaluated several times and must
   be free of side effects.

   The body is wrapped in do { } while (0) and all arguments are
   parenthesized so that the macro behaves like a single statement,
   including in an unbraced if/else.  Invoke it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008921
Alexander Belopolsky40018472011-02-26 01:02:56 +00008922Py_ssize_t
8923PyUnicode_Count(PyObject *str,
8924 PyObject *substr,
8925 Py_ssize_t start,
8926 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008928 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008929 PyObject* str_obj;
8930 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 int kind1, kind2, kind;
8932 void *buf1 = NULL, *buf2 = NULL;
8933 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008934
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008935 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008936 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008938 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008939 if (!sub_obj) {
8940 Py_DECREF(str_obj);
8941 return -1;
8942 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008943 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008944 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 Py_DECREF(str_obj);
8946 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 }
Tim Petersced69f82003-09-16 20:30:58 +00008948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 kind1 = PyUnicode_KIND(str_obj);
8950 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008951 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008954 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008955 if (kind2 > kind) {
8956 Py_DECREF(sub_obj);
8957 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008958 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008959 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008960 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 if (!buf2)
8963 goto onError;
8964 len1 = PyUnicode_GET_LENGTH(str_obj);
8965 len2 = PyUnicode_GET_LENGTH(sub_obj);
8966
8967 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008968 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008970 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8971 result = asciilib_count(
8972 ((Py_UCS1*)buf1) + start, end - start,
8973 buf2, len2, PY_SSIZE_T_MAX
8974 );
8975 else
8976 result = ucs1lib_count(
8977 ((Py_UCS1*)buf1) + start, end - start,
8978 buf2, len2, PY_SSIZE_T_MAX
8979 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 break;
8981 case PyUnicode_2BYTE_KIND:
8982 result = ucs2lib_count(
8983 ((Py_UCS2*)buf1) + start, end - start,
8984 buf2, len2, PY_SSIZE_T_MAX
8985 );
8986 break;
8987 case PyUnicode_4BYTE_KIND:
8988 result = ucs4lib_count(
8989 ((Py_UCS4*)buf1) + start, end - start,
8990 buf2, len2, PY_SSIZE_T_MAX
8991 );
8992 break;
8993 default:
8994 assert(0); result = 0;
8995 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008996
8997 Py_DECREF(sub_obj);
8998 Py_DECREF(str_obj);
8999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 if (kind2 != kind)
9001 PyMem_Free(buf2);
9002
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 onError:
9005 Py_DECREF(sub_obj);
9006 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 if (kind2 != kind && buf2)
9008 PyMem_Free(buf2);
9009 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010}
9011
Alexander Belopolsky40018472011-02-26 01:02:56 +00009012Py_ssize_t
9013PyUnicode_Find(PyObject *str,
9014 PyObject *sub,
9015 Py_ssize_t start,
9016 Py_ssize_t end,
9017 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009019 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009020
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009022 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009024 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009025 if (!sub) {
9026 Py_DECREF(str);
9027 return -2;
9028 }
9029 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9030 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 Py_DECREF(str);
9032 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 }
Tim Petersced69f82003-09-16 20:30:58 +00009034
Victor Stinner794d5672011-10-10 03:21:36 +02009035 result = any_find_slice(direction,
9036 str, sub, start, end
9037 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009038
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009040 Py_DECREF(sub);
9041
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 return result;
9043}
9044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045Py_ssize_t
9046PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9047 Py_ssize_t start, Py_ssize_t end,
9048 int direction)
9049{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009051 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (PyUnicode_READY(str) == -1)
9053 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009054 if (start < 0 || end < 0) {
9055 PyErr_SetString(PyExc_IndexError, "string index out of range");
9056 return -2;
9057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 if (end > PyUnicode_GET_LENGTH(str))
9059 end = PyUnicode_GET_LENGTH(str);
9060 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009061 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9062 kind, end-start, ch, direction);
9063 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009065 else
9066 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067}
9068
Alexander Belopolsky40018472011-02-26 01:02:56 +00009069static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009070tailmatch(PyObject *self,
9071 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009072 Py_ssize_t start,
9073 Py_ssize_t end,
9074 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 int kind_self;
9077 int kind_sub;
9078 void *data_self;
9079 void *data_sub;
9080 Py_ssize_t offset;
9081 Py_ssize_t i;
9082 Py_ssize_t end_sub;
9083
9084 if (PyUnicode_READY(self) == -1 ||
9085 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009086 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087
9088 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 return 1;
9090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9092 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 kind_self = PyUnicode_KIND(self);
9097 data_self = PyUnicode_DATA(self);
9098 kind_sub = PyUnicode_KIND(substring);
9099 data_sub = PyUnicode_DATA(substring);
9100 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9101
9102 if (direction > 0)
9103 offset = end;
9104 else
9105 offset = start;
9106
9107 if (PyUnicode_READ(kind_self, data_self, offset) ==
9108 PyUnicode_READ(kind_sub, data_sub, 0) &&
9109 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9110 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9111 /* If both are of the same kind, memcmp is sufficient */
9112 if (kind_self == kind_sub) {
9113 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009114 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 data_sub,
9116 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009117 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 }
9119 /* otherwise we have to compare each character by first accesing it */
9120 else {
9121 /* We do not need to compare 0 and len(substring)-1 because
9122 the if statement above ensured already that they are equal
9123 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 for (i = 1; i < end_sub; ++i) {
9125 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9126 PyUnicode_READ(kind_sub, data_sub, i))
9127 return 0;
9128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131 }
9132
9133 return 0;
9134}
9135
Alexander Belopolsky40018472011-02-26 01:02:56 +00009136Py_ssize_t
9137PyUnicode_Tailmatch(PyObject *str,
9138 PyObject *substr,
9139 Py_ssize_t start,
9140 Py_ssize_t end,
9141 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009143 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009144
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 str = PyUnicode_FromObject(str);
9146 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148 substr = PyUnicode_FromObject(substr);
9149 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 Py_DECREF(str);
9151 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 }
Tim Petersced69f82003-09-16 20:30:58 +00009153
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009154 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156 Py_DECREF(str);
9157 Py_DECREF(substr);
9158 return result;
9159}
9160
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161/* Apply fixfct filter to the Unicode object self and return a
9162 reference to the modified object */
9163
Alexander Belopolsky40018472011-02-26 01:02:56 +00009164static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009165fixup(PyObject *self,
9166 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 PyObject *u;
9169 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009170 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009172 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009175 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 /* fix functions return the new maximum character in a string,
9178 if the kind of the resulting unicode object does not change,
9179 everything is fine. Otherwise we need to change the string kind
9180 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009181 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009182
9183 if (maxchar_new == 0) {
9184 /* no changes */;
9185 if (PyUnicode_CheckExact(self)) {
9186 Py_DECREF(u);
9187 Py_INCREF(self);
9188 return self;
9189 }
9190 else
9191 return u;
9192 }
9193
Victor Stinnere6abb482012-05-02 01:15:40 +02009194 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195
Victor Stinnereaab6042011-12-11 22:22:39 +01009196 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009198
9199 /* In case the maximum character changed, we need to
9200 convert the string to the new category. */
9201 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9202 if (v == NULL) {
9203 Py_DECREF(u);
9204 return NULL;
9205 }
9206 if (maxchar_new > maxchar_old) {
9207 /* If the maxchar increased so that the kind changed, not all
9208 characters are representable anymore and we need to fix the
9209 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009210 _PyUnicode_FastCopyCharacters(v, 0,
9211 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009212 maxchar_old = fixfct(v);
9213 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 }
9215 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009216 _PyUnicode_FastCopyCharacters(v, 0,
9217 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009219 Py_DECREF(u);
9220 assert(_PyUnicode_CheckConsistency(v, 1));
9221 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222}
9223
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009224static PyObject *
9225ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009227 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9228 char *resdata, *data = PyUnicode_DATA(self);
9229 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009230
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009231 res = PyUnicode_New(len, 127);
9232 if (res == NULL)
9233 return NULL;
9234 resdata = PyUnicode_DATA(res);
9235 if (lower)
9236 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009238 _Py_bytes_upper(resdata, data, len);
9239 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240}
9241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009243handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009245 Py_ssize_t j;
9246 int final_sigma;
9247 Py_UCS4 c;
9248 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009249
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009250 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9251
9252 where ! is a negation and \p{xxx} is a character with property xxx.
9253 */
9254 for (j = i - 1; j >= 0; j--) {
9255 c = PyUnicode_READ(kind, data, j);
9256 if (!_PyUnicode_IsCaseIgnorable(c))
9257 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009259 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9260 if (final_sigma) {
9261 for (j = i + 1; j < length; j++) {
9262 c = PyUnicode_READ(kind, data, j);
9263 if (!_PyUnicode_IsCaseIgnorable(c))
9264 break;
9265 }
9266 final_sigma = j == length || !_PyUnicode_IsCased(c);
9267 }
9268 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269}
9270
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009271static int
9272lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9273 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009275 /* Obscure special case. */
9276 if (c == 0x3A3) {
9277 mapped[0] = handle_capital_sigma(kind, data, length, i);
9278 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009280 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281}
9282
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009283static Py_ssize_t
9284do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009286 Py_ssize_t i, k = 0;
9287 int n_res, j;
9288 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009289
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009290 c = PyUnicode_READ(kind, data, 0);
9291 n_res = _PyUnicode_ToUpperFull(c, mapped);
9292 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009293 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009294 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296 for (i = 1; i < length; i++) {
9297 c = PyUnicode_READ(kind, data, i);
9298 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9299 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009300 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009301 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009302 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009303 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009304 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305}
9306
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009307static Py_ssize_t
9308do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9309 Py_ssize_t i, k = 0;
9310
9311 for (i = 0; i < length; i++) {
9312 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9313 int n_res, j;
9314 if (Py_UNICODE_ISUPPER(c)) {
9315 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9316 }
9317 else if (Py_UNICODE_ISLOWER(c)) {
9318 n_res = _PyUnicode_ToUpperFull(c, mapped);
9319 }
9320 else {
9321 n_res = 1;
9322 mapped[0] = c;
9323 }
9324 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009325 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009326 res[k++] = mapped[j];
9327 }
9328 }
9329 return k;
9330}
9331
9332static Py_ssize_t
9333do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9334 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009336 Py_ssize_t i, k = 0;
9337
9338 for (i = 0; i < length; i++) {
9339 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9340 int n_res, j;
9341 if (lower)
9342 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9343 else
9344 n_res = _PyUnicode_ToUpperFull(c, mapped);
9345 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009346 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009347 res[k++] = mapped[j];
9348 }
9349 }
9350 return k;
9351}
9352
9353static Py_ssize_t
9354do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9355{
9356 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9357}
9358
9359static Py_ssize_t
9360do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9361{
9362 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9363}
9364
Benjamin Petersone51757f2012-01-12 21:10:29 -05009365static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009366do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9367{
9368 Py_ssize_t i, k = 0;
9369
9370 for (i = 0; i < length; i++) {
9371 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9372 Py_UCS4 mapped[3];
9373 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9374 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009375 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009376 res[k++] = mapped[j];
9377 }
9378 }
9379 return k;
9380}
9381
9382static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009383do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9384{
9385 Py_ssize_t i, k = 0;
9386 int previous_is_cased;
9387
9388 previous_is_cased = 0;
9389 for (i = 0; i < length; i++) {
9390 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9391 Py_UCS4 mapped[3];
9392 int n_res, j;
9393
9394 if (previous_is_cased)
9395 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9396 else
9397 n_res = _PyUnicode_ToTitleFull(c, mapped);
9398
9399 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009400 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009401 res[k++] = mapped[j];
9402 }
9403
9404 previous_is_cased = _PyUnicode_IsCased(c);
9405 }
9406 return k;
9407}
9408
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009409static PyObject *
9410case_operation(PyObject *self,
9411 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9412{
9413 PyObject *res = NULL;
9414 Py_ssize_t length, newlength = 0;
9415 int kind, outkind;
9416 void *data, *outdata;
9417 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9418
Benjamin Petersoneea48462012-01-16 14:28:50 -05009419 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009420
9421 kind = PyUnicode_KIND(self);
9422 data = PyUnicode_DATA(self);
9423 length = PyUnicode_GET_LENGTH(self);
9424 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9425 if (tmp == NULL)
9426 return PyErr_NoMemory();
9427 newlength = perform(kind, data, length, tmp, &maxchar);
9428 res = PyUnicode_New(newlength, maxchar);
9429 if (res == NULL)
9430 goto leave;
9431 tmpend = tmp + newlength;
9432 outdata = PyUnicode_DATA(res);
9433 outkind = PyUnicode_KIND(res);
9434 switch (outkind) {
9435 case PyUnicode_1BYTE_KIND:
9436 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9437 break;
9438 case PyUnicode_2BYTE_KIND:
9439 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9440 break;
9441 case PyUnicode_4BYTE_KIND:
9442 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9443 break;
9444 default:
9445 assert(0);
9446 break;
9447 }
9448 leave:
9449 PyMem_FREE(tmp);
9450 return res;
9451}
9452
Tim Peters8ce9f162004-08-27 01:49:32 +00009453PyObject *
9454PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009457 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009459 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009460 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9461 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009462 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009464 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009466 int use_memcpy;
9467 unsigned char *res_data = NULL, *sep_data = NULL;
9468 PyObject *last_obj;
9469 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470
Tim Peters05eba1f2004-08-27 21:32:02 +00009471 fseq = PySequence_Fast(seq, "");
9472 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009473 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009474 }
9475
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009476 /* NOTE: the following code can't call back into Python code,
9477 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009478 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009479
Tim Peters05eba1f2004-08-27 21:32:02 +00009480 seqlen = PySequence_Fast_GET_SIZE(fseq);
9481 /* If empty sequence, return u"". */
9482 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009483 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009484 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009485 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009486
Tim Peters05eba1f2004-08-27 21:32:02 +00009487 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009488 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009489 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009490 if (seqlen == 1) {
9491 if (PyUnicode_CheckExact(items[0])) {
9492 res = items[0];
9493 Py_INCREF(res);
9494 Py_DECREF(fseq);
9495 return res;
9496 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009497 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009498 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009499 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009500 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009501 /* Set up sep and seplen */
9502 if (separator == NULL) {
9503 /* fall back to a blank space separator */
9504 sep = PyUnicode_FromOrdinal(' ');
9505 if (!sep)
9506 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009507 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009508 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009509 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009510 else {
9511 if (!PyUnicode_Check(separator)) {
9512 PyErr_Format(PyExc_TypeError,
9513 "separator: expected str instance,"
9514 " %.80s found",
9515 Py_TYPE(separator)->tp_name);
9516 goto onError;
9517 }
9518 if (PyUnicode_READY(separator))
9519 goto onError;
9520 sep = separator;
9521 seplen = PyUnicode_GET_LENGTH(separator);
9522 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9523 /* inc refcount to keep this code path symmetric with the
9524 above case of a blank separator */
9525 Py_INCREF(sep);
9526 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009527 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009528 }
9529
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009530 /* There are at least two things to join, or else we have a subclass
9531 * of str in the sequence.
9532 * Do a pre-pass to figure out the total amount of space we'll
9533 * need (sz), and see whether all argument are strings.
9534 */
9535 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009536#ifdef Py_DEBUG
9537 use_memcpy = 0;
9538#else
9539 use_memcpy = 1;
9540#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009541 for (i = 0; i < seqlen; i++) {
9542 const Py_ssize_t old_sz = sz;
9543 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 if (!PyUnicode_Check(item)) {
9545 PyErr_Format(PyExc_TypeError,
9546 "sequence item %zd: expected str instance,"
9547 " %.80s found",
9548 i, Py_TYPE(item)->tp_name);
9549 goto onError;
9550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 if (PyUnicode_READY(item) == -1)
9552 goto onError;
9553 sz += PyUnicode_GET_LENGTH(item);
9554 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009555 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009556 if (i != 0)
9557 sz += seplen;
9558 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9559 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009560 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009561 goto onError;
9562 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009563 if (use_memcpy && last_obj != NULL) {
9564 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9565 use_memcpy = 0;
9566 }
9567 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009568 }
Tim Petersced69f82003-09-16 20:30:58 +00009569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009571 if (res == NULL)
9572 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009573
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009574 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009575#ifdef Py_DEBUG
9576 use_memcpy = 0;
9577#else
9578 if (use_memcpy) {
9579 res_data = PyUnicode_1BYTE_DATA(res);
9580 kind = PyUnicode_KIND(res);
9581 if (seplen != 0)
9582 sep_data = PyUnicode_1BYTE_DATA(sep);
9583 }
9584#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009585 if (use_memcpy) {
9586 for (i = 0; i < seqlen; ++i) {
9587 Py_ssize_t itemlen;
9588 item = items[i];
9589
9590 /* Copy item, and maybe the separator. */
9591 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009592 Py_MEMCPY(res_data,
9593 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009594 kind * seplen);
9595 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009596 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009597
9598 itemlen = PyUnicode_GET_LENGTH(item);
9599 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009600 Py_MEMCPY(res_data,
9601 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009602 kind * itemlen);
9603 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009604 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009605 }
9606 assert(res_data == PyUnicode_1BYTE_DATA(res)
9607 + kind * PyUnicode_GET_LENGTH(res));
9608 }
9609 else {
9610 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9611 Py_ssize_t itemlen;
9612 item = items[i];
9613
9614 /* Copy item, and maybe the separator. */
9615 if (i && seplen != 0) {
9616 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9617 res_offset += seplen;
9618 }
9619
9620 itemlen = PyUnicode_GET_LENGTH(item);
9621 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009622 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009623 res_offset += itemlen;
9624 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009625 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009626 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009627 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009628
Tim Peters05eba1f2004-08-27 21:32:02 +00009629 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009631 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633
Benjamin Peterson29060642009-01-31 22:14:21 +00009634 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009635 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009637 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 return NULL;
9639}
9640
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code-unit index `start`.  `kind` selects the code-unit type
   (1-byte, Py_UCS2 or Py_UCS4) and must not be PyUnicode_WCHAR_KIND.
   No bounds checking is performed.

   `value` and `length` are evaluated exactly once; every argument is
   parenthesized so that arbitrary expressions can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_len_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: \
            assert(0); \
            break; \
        } \
    } while (0)
9664
Victor Stinnerd3f08822012-05-29 12:57:52 +02009665void
9666_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9667 Py_UCS4 fill_char)
9668{
9669 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9670 const void *data = PyUnicode_DATA(unicode);
9671 assert(PyUnicode_IS_READY(unicode));
9672 assert(unicode_modifiable(unicode));
9673 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9674 assert(start >= 0);
9675 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9676 FILL(kind, data, fill_char, start, length);
9677}
9678
Victor Stinner3fe55312012-01-04 00:33:50 +01009679Py_ssize_t
9680PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9681 Py_UCS4 fill_char)
9682{
9683 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009684
9685 if (!PyUnicode_Check(unicode)) {
9686 PyErr_BadInternalCall();
9687 return -1;
9688 }
9689 if (PyUnicode_READY(unicode) == -1)
9690 return -1;
9691 if (unicode_check_modifiable(unicode))
9692 return -1;
9693
Victor Stinnerd3f08822012-05-29 12:57:52 +02009694 if (start < 0) {
9695 PyErr_SetString(PyExc_IndexError, "string index out of range");
9696 return -1;
9697 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009698 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9699 PyErr_SetString(PyExc_ValueError,
9700 "fill character is bigger than "
9701 "the string maximum character");
9702 return -1;
9703 }
9704
9705 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9706 length = Py_MIN(maxlen, length);
9707 if (length <= 0)
9708 return 0;
9709
Victor Stinnerd3f08822012-05-29 12:57:52 +02009710 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009711 return length;
9712}
9713
Victor Stinner9310abb2011-10-05 00:59:23 +02009714static PyObject *
9715pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009716 Py_ssize_t left,
9717 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 PyObject *u;
9721 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009722 int kind;
9723 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724
9725 if (left < 0)
9726 left = 0;
9727 if (right < 0)
9728 right = 0;
9729
Victor Stinnerc4b49542011-12-11 22:44:26 +01009730 if (left == 0 && right == 0)
9731 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9734 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009735 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9736 return NULL;
9737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009739 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009741 if (!u)
9742 return NULL;
9743
9744 kind = PyUnicode_KIND(u);
9745 data = PyUnicode_DATA(u);
9746 if (left)
9747 FILL(kind, data, fill, 0, left);
9748 if (right)
9749 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009750 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009751 assert(_PyUnicode_CheckConsistency(u, 1));
9752 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753}
9754
Alexander Belopolsky40018472011-02-26 01:02:56 +00009755PyObject *
9756PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759
9760 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009761 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009762 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009763 if (PyUnicode_READY(string) == -1) {
9764 Py_DECREF(string);
9765 return NULL;
9766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767
Benjamin Petersonead6b532011-12-20 17:23:42 -06009768 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009770 if (PyUnicode_IS_ASCII(string))
9771 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009772 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009773 PyUnicode_GET_LENGTH(string), keepends);
9774 else
9775 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009776 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009777 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 break;
9779 case PyUnicode_2BYTE_KIND:
9780 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009781 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 PyUnicode_GET_LENGTH(string), keepends);
9783 break;
9784 case PyUnicode_4BYTE_KIND:
9785 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009786 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 PyUnicode_GET_LENGTH(string), keepends);
9788 break;
9789 default:
9790 assert(0);
9791 list = 0;
9792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793 Py_DECREF(string);
9794 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795}
9796
Alexander Belopolsky40018472011-02-26 01:02:56 +00009797static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009798split(PyObject *self,
9799 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009800 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 int kind1, kind2, kind;
9803 void *buf1, *buf2;
9804 Py_ssize_t len1, len2;
9805 PyObject* out;
9806
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009808 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 if (PyUnicode_READY(self) == -1)
9811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009814 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009816 if (PyUnicode_IS_ASCII(self))
9817 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009818 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009819 PyUnicode_GET_LENGTH(self), maxcount
9820 );
9821 else
9822 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009823 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009824 PyUnicode_GET_LENGTH(self), maxcount
9825 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 case PyUnicode_2BYTE_KIND:
9827 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009828 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 PyUnicode_GET_LENGTH(self), maxcount
9830 );
9831 case PyUnicode_4BYTE_KIND:
9832 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 PyUnicode_GET_LENGTH(self), maxcount
9835 );
9836 default:
9837 assert(0);
9838 return NULL;
9839 }
9840
9841 if (PyUnicode_READY(substring) == -1)
9842 return NULL;
9843
9844 kind1 = PyUnicode_KIND(self);
9845 kind2 = PyUnicode_KIND(substring);
9846 kind = kind1 > kind2 ? kind1 : kind2;
9847 buf1 = PyUnicode_DATA(self);
9848 buf2 = PyUnicode_DATA(substring);
9849 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009850 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 if (!buf1)
9852 return NULL;
9853 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009854 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 if (!buf2) {
9856 if (kind1 != kind) PyMem_Free(buf1);
9857 return NULL;
9858 }
9859 len1 = PyUnicode_GET_LENGTH(self);
9860 len2 = PyUnicode_GET_LENGTH(substring);
9861
Benjamin Petersonead6b532011-12-20 17:23:42 -06009862 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009864 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9865 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009866 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009867 else
9868 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009869 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 break;
9871 case PyUnicode_2BYTE_KIND:
9872 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009873 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 break;
9875 case PyUnicode_4BYTE_KIND:
9876 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009877 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 break;
9879 default:
9880 out = NULL;
9881 }
9882 if (kind1 != kind)
9883 PyMem_Free(buf1);
9884 if (kind2 != kind)
9885 PyMem_Free(buf2);
9886 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887}
9888
Alexander Belopolsky40018472011-02-26 01:02:56 +00009889static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009890rsplit(PyObject *self,
9891 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009892 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 int kind1, kind2, kind;
9895 void *buf1, *buf2;
9896 Py_ssize_t len1, len2;
9897 PyObject* out;
9898
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009899 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009900 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 if (PyUnicode_READY(self) == -1)
9903 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009906 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009908 if (PyUnicode_IS_ASCII(self))
9909 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009910 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009911 PyUnicode_GET_LENGTH(self), maxcount
9912 );
9913 else
9914 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009915 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009916 PyUnicode_GET_LENGTH(self), maxcount
9917 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 case PyUnicode_2BYTE_KIND:
9919 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009920 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 PyUnicode_GET_LENGTH(self), maxcount
9922 );
9923 case PyUnicode_4BYTE_KIND:
9924 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 PyUnicode_GET_LENGTH(self), maxcount
9927 );
9928 default:
9929 assert(0);
9930 return NULL;
9931 }
9932
9933 if (PyUnicode_READY(substring) == -1)
9934 return NULL;
9935
9936 kind1 = PyUnicode_KIND(self);
9937 kind2 = PyUnicode_KIND(substring);
9938 kind = kind1 > kind2 ? kind1 : kind2;
9939 buf1 = PyUnicode_DATA(self);
9940 buf2 = PyUnicode_DATA(substring);
9941 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009942 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 if (!buf1)
9944 return NULL;
9945 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009946 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 if (!buf2) {
9948 if (kind1 != kind) PyMem_Free(buf1);
9949 return NULL;
9950 }
9951 len1 = PyUnicode_GET_LENGTH(self);
9952 len2 = PyUnicode_GET_LENGTH(substring);
9953
Benjamin Petersonead6b532011-12-20 17:23:42 -06009954 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009956 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9957 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009958 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009959 else
9960 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009961 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 break;
9963 case PyUnicode_2BYTE_KIND:
9964 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009965 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 break;
9967 case PyUnicode_4BYTE_KIND:
9968 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009969 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 break;
9971 default:
9972 out = NULL;
9973 }
9974 if (kind1 != kind)
9975 PyMem_Free(buf1);
9976 if (kind2 != kind)
9977 PyMem_Free(buf2);
9978 return out;
9979}
9980
9981static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9983 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009985 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009987 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9988 return asciilib_find(buf1, len1, buf2, len2, offset);
9989 else
9990 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 case PyUnicode_2BYTE_KIND:
9992 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9993 case PyUnicode_4BYTE_KIND:
9994 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9995 }
9996 assert(0);
9997 return -1;
9998}
9999
10000static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010001anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10002 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010004 switch (kind) {
10005 case PyUnicode_1BYTE_KIND:
10006 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10007 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10008 else
10009 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10010 case PyUnicode_2BYTE_KIND:
10011 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10012 case PyUnicode_4BYTE_KIND:
10013 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10014 }
10015 assert(0);
10016 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010017}
10018
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010019static void
10020replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10021 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10022{
10023 int kind = PyUnicode_KIND(u);
10024 void *data = PyUnicode_DATA(u);
10025 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10026 if (kind == PyUnicode_1BYTE_KIND) {
10027 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10028 (Py_UCS1 *)data + len,
10029 u1, u2, maxcount);
10030 }
10031 else if (kind == PyUnicode_2BYTE_KIND) {
10032 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10033 (Py_UCS2 *)data + len,
10034 u1, u2, maxcount);
10035 }
10036 else {
10037 assert(kind == PyUnicode_4BYTE_KIND);
10038 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10039 (Py_UCS4 *)data + len,
10040 u1, u2, maxcount);
10041 }
10042}
10043
Alexander Belopolsky40018472011-02-26 01:02:56 +000010044static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045replace(PyObject *self, PyObject *str1,
10046 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 PyObject *u;
10049 char *sbuf = PyUnicode_DATA(self);
10050 char *buf1 = PyUnicode_DATA(str1);
10051 char *buf2 = PyUnicode_DATA(str2);
10052 int srelease = 0, release1 = 0, release2 = 0;
10053 int skind = PyUnicode_KIND(self);
10054 int kind1 = PyUnicode_KIND(str1);
10055 int kind2 = PyUnicode_KIND(str2);
10056 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10057 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10058 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010059 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010060 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061
10062 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010063 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010065 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066
Victor Stinner59de0ee2011-10-07 10:01:28 +020010067 if (str1 == str2)
10068 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069
Victor Stinner49a0a212011-10-12 23:46:10 +020010070 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010071 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10072 if (maxchar < maxchar_str1)
10073 /* substring too wide to be present */
10074 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010075 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10076 /* Replacing str1 with str2 may cause a maxchar reduction in the
10077 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010078 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010079 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010082 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010084 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010086 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010087 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010088 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010089
Victor Stinner69ed0f42013-04-09 21:48:24 +020010090 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010091 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010092 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010093 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010094 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010096 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010098
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010099 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10100 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010101 }
10102 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 int rkind = skind;
10104 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010105 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 if (kind1 < rkind) {
10108 /* widen substring */
10109 buf1 = _PyUnicode_AsKind(str1, rkind);
10110 if (!buf1) goto error;
10111 release1 = 1;
10112 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010113 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010114 if (i < 0)
10115 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (rkind > kind2) {
10117 /* widen replacement */
10118 buf2 = _PyUnicode_AsKind(str2, rkind);
10119 if (!buf2) goto error;
10120 release2 = 1;
10121 }
10122 else if (rkind < kind2) {
10123 /* widen self and buf1 */
10124 rkind = kind2;
10125 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010126 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 sbuf = _PyUnicode_AsKind(self, rkind);
10128 if (!sbuf) goto error;
10129 srelease = 1;
10130 buf1 = _PyUnicode_AsKind(str1, rkind);
10131 if (!buf1) goto error;
10132 release1 = 1;
10133 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010134 u = PyUnicode_New(slen, maxchar);
10135 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010137 assert(PyUnicode_KIND(u) == rkind);
10138 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010139
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010140 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010141 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010142 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010144 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010146
10147 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010148 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010149 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010151 if (i == -1)
10152 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010153 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010155 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 }
10160 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010162 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 int rkind = skind;
10164 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010167 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 buf1 = _PyUnicode_AsKind(str1, rkind);
10169 if (!buf1) goto error;
10170 release1 = 1;
10171 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010172 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010173 if (n == 0)
10174 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010176 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 buf2 = _PyUnicode_AsKind(str2, rkind);
10178 if (!buf2) goto error;
10179 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010182 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 rkind = kind2;
10184 sbuf = _PyUnicode_AsKind(self, rkind);
10185 if (!sbuf) goto error;
10186 srelease = 1;
10187 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010188 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 buf1 = _PyUnicode_AsKind(str1, rkind);
10190 if (!buf1) goto error;
10191 release1 = 1;
10192 }
10193 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10194 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010195 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 PyErr_SetString(PyExc_OverflowError,
10197 "replace string is too long");
10198 goto error;
10199 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010200 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010201 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010202 _Py_INCREF_UNICODE_EMPTY();
10203 if (!unicode_empty)
10204 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010205 u = unicode_empty;
10206 goto done;
10207 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010208 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 PyErr_SetString(PyExc_OverflowError,
10210 "replace string is too long");
10211 goto error;
10212 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010213 u = PyUnicode_New(new_size, maxchar);
10214 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010216 assert(PyUnicode_KIND(u) == rkind);
10217 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 ires = i = 0;
10219 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220 while (n-- > 0) {
10221 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010222 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010223 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010225 if (j == -1)
10226 break;
10227 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010228 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010229 memcpy(res + rkind * ires,
10230 sbuf + rkind * i,
10231 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010233 }
10234 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010236 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010238 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010245 memcpy(res + rkind * ires,
10246 sbuf + rkind * i,
10247 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010248 }
10249 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010250 /* interleave */
10251 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010252 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010254 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010256 if (--n <= 0)
10257 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010258 memcpy(res + rkind * ires,
10259 sbuf + rkind * i,
10260 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 ires++;
10262 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010263 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010264 memcpy(res + rkind * ires,
10265 sbuf + rkind * i,
10266 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010267 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010268 }
10269
10270 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010271 unicode_adjust_maxchar(&u);
10272 if (u == NULL)
10273 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010275
10276 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (srelease)
10278 PyMem_FREE(sbuf);
10279 if (release1)
10280 PyMem_FREE(buf1);
10281 if (release2)
10282 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010283 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010285
Benjamin Peterson29060642009-01-31 22:14:21 +000010286 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010287 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 if (srelease)
10289 PyMem_FREE(sbuf);
10290 if (release1)
10291 PyMem_FREE(buf1);
10292 if (release2)
10293 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010294 return unicode_result_unchanged(self);
10295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 error:
10297 if (srelease && sbuf)
10298 PyMem_FREE(sbuf);
10299 if (release1 && buf1)
10300 PyMem_FREE(buf1);
10301 if (release2 && buf2)
10302 PyMem_FREE(buf2);
10303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304}
10305
10306/* --- Unicode Object Methods --------------------------------------------- */
10307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010308PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010309 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310\n\
10311Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010312characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
10314static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010315unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010317 if (PyUnicode_READY(self) == -1)
10318 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010319 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320}
10321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010322PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010323 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324\n\
10325Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010326have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327
10328static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010329unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010331 if (PyUnicode_READY(self) == -1)
10332 return NULL;
10333 if (PyUnicode_GET_LENGTH(self) == 0)
10334 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010335 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336}
10337
Benjamin Petersond5890c82012-01-14 13:23:30 -050010338PyDoc_STRVAR(casefold__doc__,
10339 "S.casefold() -> str\n\
10340\n\
10341Return a version of S suitable for caseless comparisons.");
10342
10343static PyObject *
10344unicode_casefold(PyObject *self)
10345{
10346 if (PyUnicode_READY(self) == -1)
10347 return NULL;
10348 if (PyUnicode_IS_ASCII(self))
10349 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010350 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010351}
10352
10353
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010354/* Argument converter. Coerces to a single unicode character */
10355
10356static int
10357convert_uc(PyObject *obj, void *addr)
10358{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010360 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010361
Benjamin Peterson14339b62009-01-31 16:36:08 +000010362 uniobj = PyUnicode_FromObject(obj);
10363 if (uniobj == NULL) {
10364 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010365 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010366 return 0;
10367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010369 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010370 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010371 Py_DECREF(uniobj);
10372 return 0;
10373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010375 Py_DECREF(uniobj);
10376 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010377}
10378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010379PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010380 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010382Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010383done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384
10385static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010386unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010388 Py_ssize_t marg, left;
10389 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 Py_UCS4 fillchar = ' ';
10391
Victor Stinnere9a29352011-10-01 02:14:59 +020010392 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394
Benjamin Petersonbac79492012-01-14 13:34:47 -050010395 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396 return NULL;
10397
Victor Stinnerc4b49542011-12-11 22:44:26 +010010398 if (PyUnicode_GET_LENGTH(self) >= width)
10399 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400
Victor Stinnerc4b49542011-12-11 22:44:26 +010010401 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402 left = marg / 2 + (marg & width & 1);
10403
Victor Stinner9310abb2011-10-05 00:59:23 +020010404 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405}
10406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407/* This function assumes that str1 and str2 are readied by the caller. */
10408
Marc-André Lemburge5034372000-08-08 08:04:29 +000010409static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010410unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010411{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010412#define COMPARE(TYPE1, TYPE2) \
10413 do { \
10414 TYPE1* p1 = (TYPE1 *)data1; \
10415 TYPE2* p2 = (TYPE2 *)data2; \
10416 TYPE1* end = p1 + len; \
10417 Py_UCS4 c1, c2; \
10418 for (; p1 != end; p1++, p2++) { \
10419 c1 = *p1; \
10420 c2 = *p2; \
10421 if (c1 != c2) \
10422 return (c1 < c2) ? -1 : 1; \
10423 } \
10424 } \
10425 while (0)
10426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 int kind1, kind2;
10428 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010429 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010430
Victor Stinner90db9c42012-10-04 21:53:50 +020010431 /* a string is equal to itself */
10432 if (str1 == str2)
10433 return 0;
10434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 kind1 = PyUnicode_KIND(str1);
10436 kind2 = PyUnicode_KIND(str2);
10437 data1 = PyUnicode_DATA(str1);
10438 data2 = PyUnicode_DATA(str2);
10439 len1 = PyUnicode_GET_LENGTH(str1);
10440 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010441 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010442
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010443 switch(kind1) {
10444 case PyUnicode_1BYTE_KIND:
10445 {
10446 switch(kind2) {
10447 case PyUnicode_1BYTE_KIND:
10448 {
10449 int cmp = memcmp(data1, data2, len);
10450 /* normalize result of memcmp() into the range [-1; 1] */
10451 if (cmp < 0)
10452 return -1;
10453 if (cmp > 0)
10454 return 1;
10455 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010456 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010457 case PyUnicode_2BYTE_KIND:
10458 COMPARE(Py_UCS1, Py_UCS2);
10459 break;
10460 case PyUnicode_4BYTE_KIND:
10461 COMPARE(Py_UCS1, Py_UCS4);
10462 break;
10463 default:
10464 assert(0);
10465 }
10466 break;
10467 }
10468 case PyUnicode_2BYTE_KIND:
10469 {
10470 switch(kind2) {
10471 case PyUnicode_1BYTE_KIND:
10472 COMPARE(Py_UCS2, Py_UCS1);
10473 break;
10474 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010475 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010476 COMPARE(Py_UCS2, Py_UCS2);
10477 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010478 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010479 case PyUnicode_4BYTE_KIND:
10480 COMPARE(Py_UCS2, Py_UCS4);
10481 break;
10482 default:
10483 assert(0);
10484 }
10485 break;
10486 }
10487 case PyUnicode_4BYTE_KIND:
10488 {
10489 switch(kind2) {
10490 case PyUnicode_1BYTE_KIND:
10491 COMPARE(Py_UCS4, Py_UCS1);
10492 break;
10493 case PyUnicode_2BYTE_KIND:
10494 COMPARE(Py_UCS4, Py_UCS2);
10495 break;
10496 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010497 {
10498#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10499 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10500 /* normalize result of wmemcmp() into the range [-1; 1] */
10501 if (cmp < 0)
10502 return -1;
10503 if (cmp > 0)
10504 return 1;
10505#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010506 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010507#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010508 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010509 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010510 default:
10511 assert(0);
10512 }
10513 break;
10514 }
10515 default:
10516 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010517 }
10518
Victor Stinner770e19e2012-10-04 22:59:45 +020010519 if (len1 == len2)
10520 return 0;
10521 if (len1 < len2)
10522 return -1;
10523 else
10524 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010525
10526#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010527}
10528
Victor Stinnere5567ad2012-10-23 02:48:49 +020010529static int
10530unicode_compare_eq(PyObject *str1, PyObject *str2)
10531{
10532 int kind;
10533 void *data1, *data2;
10534 Py_ssize_t len;
10535 int cmp;
10536
10537 /* a string is equal to itself */
10538 if (str1 == str2)
10539 return 1;
10540
10541 len = PyUnicode_GET_LENGTH(str1);
10542 if (PyUnicode_GET_LENGTH(str2) != len)
10543 return 0;
10544 kind = PyUnicode_KIND(str1);
10545 if (PyUnicode_KIND(str2) != kind)
10546 return 0;
10547 data1 = PyUnicode_DATA(str1);
10548 data2 = PyUnicode_DATA(str2);
10549
10550 cmp = memcmp(data1, data2, len * kind);
10551 return (cmp == 0);
10552}
10553
10554
Alexander Belopolsky40018472011-02-26 01:02:56 +000010555int
10556PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10559 if (PyUnicode_READY(left) == -1 ||
10560 PyUnicode_READY(right) == -1)
10561 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010562 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010564 PyErr_Format(PyExc_TypeError,
10565 "Can't compare %.100s and %.100s",
10566 left->ob_type->tp_name,
10567 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568 return -1;
10569}
10570
Martin v. Löwis5b222132007-06-10 09:51:05 +000010571int
10572PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10573{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 Py_ssize_t i;
10575 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 Py_UCS4 chr;
10577
Victor Stinner910337b2011-10-03 03:20:16 +020010578 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (PyUnicode_READY(uni) == -1)
10580 return -1;
10581 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010582 if (kind == PyUnicode_1BYTE_KIND) {
10583 char *data = PyUnicode_1BYTE_DATA(uni);
10584 Py_ssize_t len1 = PyUnicode_GET_LENGTH(uni);
10585 size_t len, len2 = strlen(str);
10586 int cmp;
10587
10588 len = Py_MIN(len1, len2);
10589 cmp = memcmp(data, str, len);
10590 if (cmp != 0)
10591 return cmp;
10592 if (len1 > len2)
10593 return 1; /* uni is longer */
10594 if (len2 > len1)
10595 return -1; /* str is longer */
10596 return 0;
10597 }
10598 else {
10599 void *data = PyUnicode_DATA(uni);
10600 /* Compare Unicode string and source character set string */
10601 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10602 if (chr != str[i])
10603 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10604 /* This check keeps Python strings that end in '\0' from comparing equal
10605 to C strings identical up to that point. */
10606 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10607 return 1; /* uni is longer */
10608 if (str[i])
10609 return -1; /* str is longer */
10610 return 0;
10611 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010612}
10613
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010614
Benjamin Peterson29060642009-01-31 22:14:21 +000010615#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010616 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010617
Alexander Belopolsky40018472011-02-26 01:02:56 +000010618PyObject *
10619PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010620{
10621 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010622 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010623
Victor Stinnere5567ad2012-10-23 02:48:49 +020010624 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10625 Py_RETURN_NOTIMPLEMENTED;
10626
10627 if (PyUnicode_READY(left) == -1 ||
10628 PyUnicode_READY(right) == -1)
10629 return NULL;
10630
10631 if (op == Py_EQ || op == Py_NE) {
10632 result = unicode_compare_eq(left, right);
10633 if (op == Py_EQ)
10634 v = TEST_COND(result);
10635 else
10636 v = TEST_COND(!result);
10637 }
10638 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010639 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010640
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010641 /* Convert the return value to a Boolean */
10642 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010643 case Py_LE:
10644 v = TEST_COND(result <= 0);
10645 break;
10646 case Py_GE:
10647 v = TEST_COND(result >= 0);
10648 break;
10649 case Py_LT:
10650 v = TEST_COND(result == -1);
10651 break;
10652 case Py_GT:
10653 v = TEST_COND(result == 1);
10654 break;
10655 default:
10656 PyErr_BadArgument();
10657 return NULL;
10658 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010659 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010660 Py_INCREF(v);
10661 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010662}
10663
Alexander Belopolsky40018472011-02-26 01:02:56 +000010664int
10665PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010666{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010668 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 void *buf1, *buf2;
10670 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010671 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010672
10673 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010674 sub = PyUnicode_FromObject(element);
10675 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010676 PyErr_Format(PyExc_TypeError,
10677 "'in <string>' requires string as left operand, not %s",
10678 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010680 }
10681
Thomas Wouters477c8d52006-05-27 19:21:47 +000010682 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010683 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 Py_DECREF(sub);
10685 return -1;
10686 }
10687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 kind1 = PyUnicode_KIND(str);
10689 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 buf1 = PyUnicode_DATA(str);
10691 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010692 if (kind2 != kind1) {
10693 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010694 Py_DECREF(sub);
10695 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010696 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010697 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010698 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 if (!buf2) {
10701 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010702 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 return -1;
10704 }
10705 len1 = PyUnicode_GET_LENGTH(str);
10706 len2 = PyUnicode_GET_LENGTH(sub);
10707
Victor Stinner77282cb2013-04-14 19:22:47 +020010708 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 case PyUnicode_1BYTE_KIND:
10710 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10711 break;
10712 case PyUnicode_2BYTE_KIND:
10713 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10714 break;
10715 case PyUnicode_4BYTE_KIND:
10716 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10717 break;
10718 default:
10719 result = -1;
10720 assert(0);
10721 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722
10723 Py_DECREF(str);
10724 Py_DECREF(sub);
10725
Victor Stinner77282cb2013-04-14 19:22:47 +020010726 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 PyMem_Free(buf2);
10728
Guido van Rossum403d68b2000-03-13 15:55:09 +000010729 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010730}
10731
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732/* Concat to string or Unicode object giving a new Unicode object. */
10733
Alexander Belopolsky40018472011-02-26 01:02:56 +000010734PyObject *
10735PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010738 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010739 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
10741 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010744 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748
10749 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010750 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010751 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010754 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010755 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757 }
10758
Victor Stinner488fa492011-12-12 00:01:39 +010010759 u_len = PyUnicode_GET_LENGTH(u);
10760 v_len = PyUnicode_GET_LENGTH(v);
10761 if (u_len > PY_SSIZE_T_MAX - v_len) {
10762 PyErr_SetString(PyExc_OverflowError,
10763 "strings are too large to concat");
10764 goto onError;
10765 }
10766 new_len = u_len + v_len;
10767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010769 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010770 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010773 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010775 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010776 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10777 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778 Py_DECREF(u);
10779 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010780 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784 Py_XDECREF(u);
10785 Py_XDECREF(v);
10786 return NULL;
10787}
10788
Walter Dörwald1ab83302007-05-18 17:15:44 +000010789void
Victor Stinner23e56682011-10-03 03:54:37 +020010790PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010791{
Victor Stinner23e56682011-10-03 03:54:37 +020010792 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010793 Py_UCS4 maxchar, maxchar2;
10794 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010795
10796 if (p_left == NULL) {
10797 if (!PyErr_Occurred())
10798 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010799 return;
10800 }
Victor Stinner23e56682011-10-03 03:54:37 +020010801 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020010802 if (right == NULL || left == NULL
10803 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010804 if (!PyErr_Occurred())
10805 PyErr_BadInternalCall();
10806 goto error;
10807 }
10808
Benjamin Petersonbac79492012-01-14 13:34:47 -050010809 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010810 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010811 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010812 goto error;
10813
Victor Stinner488fa492011-12-12 00:01:39 +010010814 /* Shortcuts */
10815 if (left == unicode_empty) {
10816 Py_DECREF(left);
10817 Py_INCREF(right);
10818 *p_left = right;
10819 return;
10820 }
10821 if (right == unicode_empty)
10822 return;
10823
10824 left_len = PyUnicode_GET_LENGTH(left);
10825 right_len = PyUnicode_GET_LENGTH(right);
10826 if (left_len > PY_SSIZE_T_MAX - right_len) {
10827 PyErr_SetString(PyExc_OverflowError,
10828 "strings are too large to concat");
10829 goto error;
10830 }
10831 new_len = left_len + right_len;
10832
10833 if (unicode_modifiable(left)
10834 && PyUnicode_CheckExact(right)
10835 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010836 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10837 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010838 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010839 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010840 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10841 {
10842 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010843 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010010844 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020010845
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010846 /* copy 'right' into the newly allocated area of 'left' */
10847 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010848 }
Victor Stinner488fa492011-12-12 00:01:39 +010010849 else {
10850 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10851 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010852 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010853
Victor Stinner488fa492011-12-12 00:01:39 +010010854 /* Concat the two Unicode strings */
10855 res = PyUnicode_New(new_len, maxchar);
10856 if (res == NULL)
10857 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010858 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10859 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010860 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010861 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010010862 }
10863 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010864 return;
10865
10866error:
Victor Stinner488fa492011-12-12 00:01:39 +010010867 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010868}
10869
10870void
10871PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10872{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010873 PyUnicode_Append(pleft, right);
10874 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010875}
10876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010877PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010880Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010881string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010882interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883
10884static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010885unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010887 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010888 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010889 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 int kind1, kind2, kind;
10892 void *buf1, *buf2;
10893 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894
Jesus Ceaac451502011-04-20 17:09:23 +020010895 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10896 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010897 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 kind1 = PyUnicode_KIND(self);
10900 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020010901 if (kind2 > kind1) {
10902 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010903 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020010904 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010905 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 buf1 = PyUnicode_DATA(self);
10907 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010909 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 if (!buf2) {
10911 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 return NULL;
10913 }
10914 len1 = PyUnicode_GET_LENGTH(self);
10915 len2 = PyUnicode_GET_LENGTH(substring);
10916
10917 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010918 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 case PyUnicode_1BYTE_KIND:
10920 iresult = ucs1lib_count(
10921 ((Py_UCS1*)buf1) + start, end - start,
10922 buf2, len2, PY_SSIZE_T_MAX
10923 );
10924 break;
10925 case PyUnicode_2BYTE_KIND:
10926 iresult = ucs2lib_count(
10927 ((Py_UCS2*)buf1) + start, end - start,
10928 buf2, len2, PY_SSIZE_T_MAX
10929 );
10930 break;
10931 case PyUnicode_4BYTE_KIND:
10932 iresult = ucs4lib_count(
10933 ((Py_UCS4*)buf1) + start, end - start,
10934 buf2, len2, PY_SSIZE_T_MAX
10935 );
10936 break;
10937 default:
10938 assert(0); iresult = 0;
10939 }
10940
10941 result = PyLong_FromSsize_t(iresult);
10942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 if (kind2 != kind)
10944 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945
10946 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010947
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 return result;
10949}
10950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010951PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010952 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010954Encode S using the codec registered for encoding. Default encoding\n\
10955is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010956handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010957a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10958'xmlcharrefreplace' as well as any other name registered with\n\
10959codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
10961static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010962unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010964 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965 char *encoding = NULL;
10966 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010967
Benjamin Peterson308d6372009-09-18 21:42:35 +000010968 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10969 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010971 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010972}
10973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010974PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976\n\
10977Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010978If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979
10980static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010981unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010983 Py_ssize_t i, j, line_pos, src_len, incr;
10984 Py_UCS4 ch;
10985 PyObject *u;
10986 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010988 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010989 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990
10991 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
Antoine Pitrou22425222011-10-04 19:10:51 +020010994 if (PyUnicode_READY(self) == -1)
10995 return NULL;
10996
Thomas Wouters7e474022000-07-16 12:04:32 +000010997 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010998 src_len = PyUnicode_GET_LENGTH(self);
10999 i = j = line_pos = 0;
11000 kind = PyUnicode_KIND(self);
11001 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011002 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011003 for (; i < src_len; i++) {
11004 ch = PyUnicode_READ(kind, src_data, i);
11005 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011006 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011008 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011010 goto overflow;
11011 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011013 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011017 goto overflow;
11018 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011020 if (ch == '\n' || ch == '\r')
11021 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011023 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011024 if (!found)
11025 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011026
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011028 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 if (!u)
11030 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011031 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032
Antoine Pitroue71d5742011-10-04 15:55:09 +020011033 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
Antoine Pitroue71d5742011-10-04 15:55:09 +020011035 for (; i < src_len; i++) {
11036 ch = PyUnicode_READ(kind, src_data, i);
11037 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011038 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011039 incr = tabsize - (line_pos % tabsize);
11040 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011041 FILL(kind, dest_data, ' ', j, incr);
11042 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011043 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011044 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011045 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011046 line_pos++;
11047 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011048 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011049 if (ch == '\n' || ch == '\r')
11050 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011052 }
11053 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011054 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011055
Antoine Pitroue71d5742011-10-04 15:55:09 +020011056 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011057 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059}
11060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011061PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011062 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063\n\
11064Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011065such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066arguments start and end are interpreted as in slice notation.\n\
11067\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011068Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
11070static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011073 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011074 Py_ssize_t start;
11075 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011076 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
Jesus Ceaac451502011-04-20 17:09:23 +020011078 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11079 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081
Christian Heimesd47802e2013-06-29 21:33:36 +020011082 if (PyUnicode_READY(self) == -1) {
11083 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011085 }
11086 if (PyUnicode_READY(substring) == -1) {
11087 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090
Victor Stinner7931d9a2011-11-04 00:22:48 +010011091 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092
11093 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 if (result == -2)
11096 return NULL;
11097
Christian Heimes217cfd12007-12-02 14:31:20 +000011098 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099}
11100
11101static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011102unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011104 void *data;
11105 enum PyUnicode_Kind kind;
11106 Py_UCS4 ch;
11107 PyObject *res;
11108
11109 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11110 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011112 }
11113 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11114 PyErr_SetString(PyExc_IndexError, "string index out of range");
11115 return NULL;
11116 }
11117 kind = PyUnicode_KIND(self);
11118 data = PyUnicode_DATA(self);
11119 ch = PyUnicode_READ(kind, data, index);
11120 if (ch < 256)
11121 return get_latin1_char(ch);
11122
11123 res = PyUnicode_New(1, ch);
11124 if (res == NULL)
11125 return NULL;
11126 kind = PyUnicode_KIND(res);
11127 data = PyUnicode_DATA(res);
11128 PyUnicode_WRITE(kind, data, 0, ch);
11129 assert(_PyUnicode_CheckConsistency(res, 1));
11130 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131}
11132
Guido van Rossumc2504932007-09-18 19:42:40 +000011133/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011134 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011135static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011136unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137{
Guido van Rossumc2504932007-09-18 19:42:40 +000011138 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011139 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011140
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011141#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011142 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011143#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 if (_PyUnicode_HASH(self) != -1)
11145 return _PyUnicode_HASH(self);
11146 if (PyUnicode_READY(self) == -1)
11147 return -1;
11148 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011149 /*
11150 We make the hash of the empty string be 0, rather than using
11151 (prefix ^ suffix), since this slightly obfuscates the hash secret
11152 */
11153 if (len == 0) {
11154 _PyUnicode_HASH(self) = 0;
11155 return 0;
11156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157
11158 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011159#define HASH(P) \
11160 x ^= (Py_uhash_t) *P << 7; \
11161 while (--len >= 0) \
11162 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163
Georg Brandl2fb477c2012-02-21 00:33:36 +010011164 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 switch (PyUnicode_KIND(self)) {
11166 case PyUnicode_1BYTE_KIND: {
11167 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11168 HASH(c);
11169 break;
11170 }
11171 case PyUnicode_2BYTE_KIND: {
11172 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11173 HASH(s);
11174 break;
11175 }
11176 default: {
11177 Py_UCS4 *l;
11178 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11179 "Impossible switch case in unicode_hash");
11180 l = PyUnicode_4BYTE_DATA(self);
11181 HASH(l);
11182 break;
11183 }
11184 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011185 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11186 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187
Guido van Rossumc2504932007-09-18 19:42:40 +000011188 if (x == -1)
11189 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011191 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011195PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011196 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011198Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
11200static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011203 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011204 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011205 Py_ssize_t start;
11206 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
Jesus Ceaac451502011-04-20 17:09:23 +020011208 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11209 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211
Christian Heimesd47a0452013-06-29 21:21:37 +020011212 if (PyUnicode_READY(self) == -1) {
11213 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011215 }
11216 if (PyUnicode_READY(substring) == -1) {
11217 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220
Victor Stinner7931d9a2011-11-04 00:22:48 +010011221 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222
11223 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 if (result == -2)
11226 return NULL;
11227
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 if (result < 0) {
11229 PyErr_SetString(PyExc_ValueError, "substring not found");
11230 return NULL;
11231 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011232
Christian Heimes217cfd12007-12-02 14:31:20 +000011233 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234}
11235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011239Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241
11242static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011243unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 Py_ssize_t i, length;
11246 int kind;
11247 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248 int cased;
11249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 if (PyUnicode_READY(self) == -1)
11251 return NULL;
11252 length = PyUnicode_GET_LENGTH(self);
11253 kind = PyUnicode_KIND(self);
11254 data = PyUnicode_DATA(self);
11255
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 if (length == 1)
11258 return PyBool_FromLong(
11259 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011261 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011264
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 for (i = 0; i < length; i++) {
11267 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011268
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11270 return PyBool_FromLong(0);
11271 else if (!cased && Py_UNICODE_ISLOWER(ch))
11272 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011274 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275}
11276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011277PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011280Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011281at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282
11283static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011284unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 Py_ssize_t i, length;
11287 int kind;
11288 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 int cased;
11290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 if (PyUnicode_READY(self) == -1)
11292 return NULL;
11293 length = PyUnicode_GET_LENGTH(self);
11294 kind = PyUnicode_KIND(self);
11295 data = PyUnicode_DATA(self);
11296
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 if (length == 1)
11299 return PyBool_FromLong(
11300 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011302 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011304 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011305
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 for (i = 0; i < length; i++) {
11308 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011309
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11311 return PyBool_FromLong(0);
11312 else if (!cased && Py_UNICODE_ISUPPER(ch))
11313 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011315 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316}
11317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011318PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011321Return True if S is a titlecased string and there is at least one\n\
11322character in S, i.e. upper- and titlecase characters may only\n\
11323follow uncased characters and lowercase characters only cased ones.\n\
11324Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
11326static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011327unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 Py_ssize_t i, length;
11330 int kind;
11331 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332 int cased, previous_is_cased;
11333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 if (PyUnicode_READY(self) == -1)
11335 return NULL;
11336 length = PyUnicode_GET_LENGTH(self);
11337 kind = PyUnicode_KIND(self);
11338 data = PyUnicode_DATA(self);
11339
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 if (length == 1) {
11342 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11343 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11344 (Py_UNICODE_ISUPPER(ch) != 0));
11345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011347 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011349 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011350
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351 cased = 0;
11352 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 for (i = 0; i < length; i++) {
11354 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011355
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11357 if (previous_is_cased)
11358 return PyBool_FromLong(0);
11359 previous_is_cased = 1;
11360 cased = 1;
11361 }
11362 else if (Py_UNICODE_ISLOWER(ch)) {
11363 if (!previous_is_cased)
11364 return PyBool_FromLong(0);
11365 previous_is_cased = 1;
11366 cased = 1;
11367 }
11368 else
11369 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011371 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372}
11373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011374PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011377Return True if all characters in S are whitespace\n\
11378and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379
11380static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011381unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 Py_ssize_t i, length;
11384 int kind;
11385 void *data;
11386
11387 if (PyUnicode_READY(self) == -1)
11388 return NULL;
11389 length = PyUnicode_GET_LENGTH(self);
11390 kind = PyUnicode_KIND(self);
11391 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 if (length == 1)
11395 return PyBool_FromLong(
11396 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011398 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 for (i = 0; i < length; i++) {
11403 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011404 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011407 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408}
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011412\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011413Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011415
11416static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011417unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011418{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 Py_ssize_t i, length;
11420 int kind;
11421 void *data;
11422
11423 if (PyUnicode_READY(self) == -1)
11424 return NULL;
11425 length = PyUnicode_GET_LENGTH(self);
11426 kind = PyUnicode_KIND(self);
11427 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011428
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011429 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 if (length == 1)
11431 return PyBool_FromLong(
11432 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011433
11434 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 for (i = 0; i < length; i++) {
11439 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011441 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011442 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011443}
11444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011445PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011447\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011448Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011449and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011450
11451static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011452unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011453{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 int kind;
11455 void *data;
11456 Py_ssize_t len, i;
11457
11458 if (PyUnicode_READY(self) == -1)
11459 return NULL;
11460
11461 kind = PyUnicode_KIND(self);
11462 data = PyUnicode_DATA(self);
11463 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011464
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011465 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 if (len == 1) {
11467 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11468 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11469 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011470
11471 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 for (i = 0; i < len; i++) {
11476 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011477 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011479 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011480 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011481}
11482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011486Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011487False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
11489static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011490unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 Py_ssize_t i, length;
11493 int kind;
11494 void *data;
11495
11496 if (PyUnicode_READY(self) == -1)
11497 return NULL;
11498 length = PyUnicode_GET_LENGTH(self);
11499 kind = PyUnicode_KIND(self);
11500 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 if (length == 1)
11504 return PyBool_FromLong(
11505 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011507 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 for (i = 0; i < length; i++) {
11512 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011515 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516}
11517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011518PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011521Return True if all characters in S are digits\n\
11522and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523
11524static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011525unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 Py_ssize_t i, length;
11528 int kind;
11529 void *data;
11530
11531 if (PyUnicode_READY(self) == -1)
11532 return NULL;
11533 length = PyUnicode_GET_LENGTH(self);
11534 kind = PyUnicode_KIND(self);
11535 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 if (length == 1) {
11539 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11540 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011543 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 for (i = 0; i < length; i++) {
11548 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011551 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552}
11553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011554PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011557Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011558False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559
11560static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011561unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 Py_ssize_t i, length;
11564 int kind;
11565 void *data;
11566
11567 if (PyUnicode_READY(self) == -1)
11568 return NULL;
11569 length = PyUnicode_GET_LENGTH(self);
11570 kind = PyUnicode_KIND(self);
11571 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 if (length == 1)
11575 return PyBool_FromLong(
11576 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011578 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582 for (i = 0; i < length; i++) {
11583 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011586 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587}
11588
Martin v. Löwis47383402007-08-15 07:32:56 +000011589int
11590PyUnicode_IsIdentifier(PyObject *self)
11591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 int kind;
11593 void *data;
11594 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011595 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597 if (PyUnicode_READY(self) == -1) {
11598 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 }
11601
11602 /* Special case for empty strings */
11603 if (PyUnicode_GET_LENGTH(self) == 0)
11604 return 0;
11605 kind = PyUnicode_KIND(self);
11606 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011607
11608 /* PEP 3131 says that the first character must be in
11609 XID_Start and subsequent characters in XID_Continue,
11610 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011611 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011612 letters, digits, underscore). However, given the current
11613 definition of XID_Start and XID_Continue, it is sufficient
11614 to check just for these, except that _ must be allowed
11615 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011617 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011618 return 0;
11619
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011620 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011623 return 1;
11624}
11625
11626PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011628\n\
11629Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011630to the language definition.\n\
11631\n\
11632Use keyword.iskeyword() to test for reserved identifiers\n\
11633such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011634
11635static PyObject*
11636unicode_isidentifier(PyObject *self)
11637{
11638 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11639}
11640
Georg Brandl559e5d72008-06-11 18:37:52 +000011641PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011643\n\
11644Return True if all characters in S are considered\n\
11645printable in repr() or S is empty, False otherwise.");
11646
11647static PyObject*
11648unicode_isprintable(PyObject *self)
11649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 Py_ssize_t i, length;
11651 int kind;
11652 void *data;
11653
11654 if (PyUnicode_READY(self) == -1)
11655 return NULL;
11656 length = PyUnicode_GET_LENGTH(self);
11657 kind = PyUnicode_KIND(self);
11658 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011659
11660 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (length == 1)
11662 return PyBool_FromLong(
11663 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 for (i = 0; i < length; i++) {
11666 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011667 Py_RETURN_FALSE;
11668 }
11669 }
11670 Py_RETURN_TRUE;
11671}
11672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011673PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011674 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675\n\
11676Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011677iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678
11679static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011680unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011682 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683}
11684
Martin v. Löwis18e16552006-02-15 17:27:45 +000011685static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011686unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 if (PyUnicode_READY(self) == -1)
11689 return -1;
11690 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691}
11692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011693PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011694 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011696Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011697done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698
11699static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011700unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011702 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 Py_UCS4 fillchar = ' ';
11704
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011705 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706 return NULL;
11707
Benjamin Petersonbac79492012-01-14 13:34:47 -050011708 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710
Victor Stinnerc4b49542011-12-11 22:44:26 +010011711 if (PyUnicode_GET_LENGTH(self) >= width)
11712 return unicode_result_unchanged(self);
11713
11714 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715}
11716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011717PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011718 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011720Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
11722static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011723unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011725 if (PyUnicode_READY(self) == -1)
11726 return NULL;
11727 if (PyUnicode_IS_ASCII(self))
11728 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011729 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730}
11731
/* Strip modes understood by _PyUnicode_XStrip(); they also serve as
   indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Skip the three-character "|O:" prefix of a format string, leaving just
   the method name. */
#define STRIPNAME(i) (stripformat[i]+3)
11740
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011741/* externally visible for str.strip(unicode) */
11742PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011743_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 void *data;
11746 int kind;
11747 Py_ssize_t i, j, len;
11748 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011749 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11752 return NULL;
11753
11754 kind = PyUnicode_KIND(self);
11755 data = PyUnicode_DATA(self);
11756 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011757 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11759 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011760 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011761
Benjamin Peterson14339b62009-01-31 16:36:08 +000011762 i = 0;
11763 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011764 while (i < len) {
11765 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11766 if (!BLOOM(sepmask, ch))
11767 break;
11768 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11769 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 i++;
11771 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011772 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011773
Benjamin Peterson14339b62009-01-31 16:36:08 +000011774 j = len;
11775 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011776 j--;
11777 while (j >= i) {
11778 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11779 if (!BLOOM(sepmask, ch))
11780 break;
11781 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11782 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011784 }
11785
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011787 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011788
Victor Stinner7931d9a2011-11-04 00:22:48 +010011789 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790}
11791
11792PyObject*
11793PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11794{
11795 unsigned char *data;
11796 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011797 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798
Victor Stinnerde636f32011-10-01 03:55:54 +020011799 if (PyUnicode_READY(self) == -1)
11800 return NULL;
11801
Victor Stinner684d5fd2012-05-03 02:32:34 +020011802 length = PyUnicode_GET_LENGTH(self);
11803 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011804
Victor Stinner684d5fd2012-05-03 02:32:34 +020011805 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011806 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807
Victor Stinnerde636f32011-10-01 03:55:54 +020011808 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011809 PyErr_SetString(PyExc_IndexError, "string index out of range");
11810 return NULL;
11811 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011812 if (start >= length || end < start)
11813 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011814
Victor Stinner684d5fd2012-05-03 02:32:34 +020011815 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011816 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011817 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011818 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011819 }
11820 else {
11821 kind = PyUnicode_KIND(self);
11822 data = PyUnicode_1BYTE_DATA(self);
11823 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011824 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011825 length);
11826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828
11829static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011830do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 Py_ssize_t len, i, j;
11833
11834 if (PyUnicode_READY(self) == -1)
11835 return NULL;
11836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011838
Victor Stinnercc7af722013-04-09 22:39:24 +020011839 if (PyUnicode_IS_ASCII(self)) {
11840 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11841
11842 i = 0;
11843 if (striptype != RIGHTSTRIP) {
11844 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011845 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020011846 if (!_Py_ascii_whitespace[ch])
11847 break;
11848 i++;
11849 }
11850 }
11851
11852 j = len;
11853 if (striptype != LEFTSTRIP) {
11854 j--;
11855 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011856 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020011857 if (!_Py_ascii_whitespace[ch])
11858 break;
11859 j--;
11860 }
11861 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011862 }
11863 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011864 else {
11865 int kind = PyUnicode_KIND(self);
11866 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011867
Victor Stinnercc7af722013-04-09 22:39:24 +020011868 i = 0;
11869 if (striptype != RIGHTSTRIP) {
11870 while (i < len) {
11871 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11872 if (!Py_UNICODE_ISSPACE(ch))
11873 break;
11874 i++;
11875 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011876 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011877
11878 j = len;
11879 if (striptype != LEFTSTRIP) {
11880 j--;
11881 while (j >= i) {
11882 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11883 if (!Py_UNICODE_ISSPACE(ch))
11884 break;
11885 j--;
11886 }
11887 j++;
11888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011889 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011890
Victor Stinner7931d9a2011-11-04 00:22:48 +010011891 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892}
11893
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011894
11895static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011896do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011897{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011898 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011899
Serhiy Storchakac6792272013-10-19 21:03:34 +030011900 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011901 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011902
Benjamin Peterson14339b62009-01-31 16:36:08 +000011903 if (sep != NULL && sep != Py_None) {
11904 if (PyUnicode_Check(sep))
11905 return _PyUnicode_XStrip(self, striptype, sep);
11906 else {
11907 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 "%s arg must be None or str",
11909 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011910 return NULL;
11911 }
11912 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011913
Benjamin Peterson14339b62009-01-31 16:36:08 +000011914 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011915}
11916
11917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011918PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011920\n\
11921Return a copy of the string S with leading and trailing\n\
11922whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011923If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011924
11925static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011926unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011927{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011928 if (PyTuple_GET_SIZE(args) == 0)
11929 return do_strip(self, BOTHSTRIP); /* Common case */
11930 else
11931 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011932}
11933
11934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011935PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011937\n\
11938Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011939If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011940
11941static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011942unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011943{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011944 if (PyTuple_GET_SIZE(args) == 0)
11945 return do_strip(self, LEFTSTRIP); /* Common case */
11946 else
11947 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011948}
11949
11950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011951PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011953\n\
11954Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011955If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011956
11957static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011958unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011959{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011960 if (PyTuple_GET_SIZE(args) == 0)
11961 return do_strip(self, RIGHTSTRIP); /* Common case */
11962 else
11963 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011964}
11965
11966
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011968unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011970 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
Serhiy Storchaka05997252013-01-26 12:14:02 +020011973 if (len < 1)
11974 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
Victor Stinnerc4b49542011-12-11 22:44:26 +010011976 /* no repeat, return original string */
11977 if (len == 1)
11978 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011979
Benjamin Petersonbac79492012-01-14 13:34:47 -050011980 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 return NULL;
11982
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011983 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011984 PyErr_SetString(PyExc_OverflowError,
11985 "repeated string is too long");
11986 return NULL;
11987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011989
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011990 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991 if (!u)
11992 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011993 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (PyUnicode_GET_LENGTH(str) == 1) {
11996 const int kind = PyUnicode_KIND(str);
11997 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011998 if (kind == PyUnicode_1BYTE_KIND) {
11999 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012000 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012001 }
12002 else if (kind == PyUnicode_2BYTE_KIND) {
12003 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012004 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012005 ucs2[n] = fill_char;
12006 } else {
12007 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12008 assert(kind == PyUnicode_4BYTE_KIND);
12009 for (n = 0; n < len; ++n)
12010 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 }
12013 else {
12014 /* number of characters copied this far */
12015 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012016 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 char *to = (char *) PyUnicode_DATA(u);
12018 Py_MEMCPY(to, PyUnicode_DATA(str),
12019 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 n = (done <= nchars-done) ? done : nchars-done;
12022 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012023 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025 }
12026
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012027 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012028 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029}
12030
Alexander Belopolsky40018472011-02-26 01:02:56 +000012031PyObject *
12032PyUnicode_Replace(PyObject *obj,
12033 PyObject *subobj,
12034 PyObject *replobj,
12035 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036{
12037 PyObject *self;
12038 PyObject *str1;
12039 PyObject *str2;
12040 PyObject *result;
12041
12042 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012043 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012046 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 Py_DECREF(self);
12048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 }
12050 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012051 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 Py_DECREF(self);
12053 Py_DECREF(str1);
12054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012056 if (PyUnicode_READY(self) == -1 ||
12057 PyUnicode_READY(str1) == -1 ||
12058 PyUnicode_READY(str2) == -1)
12059 result = NULL;
12060 else
12061 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 Py_DECREF(self);
12063 Py_DECREF(str1);
12064 Py_DECREF(str2);
12065 return result;
12066}
12067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012068PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012069 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070\n\
12071Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012072old replaced by new. If the optional argument count is\n\
12073given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074
12075static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 PyObject *str1;
12079 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012080 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 PyObject *result;
12082
Martin v. Löwis18e16552006-02-15 17:27:45 +000012083 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012085 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012086 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012088 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 return NULL;
12090 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012091 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 Py_DECREF(str1);
12093 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012094 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012095 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12096 result = NULL;
12097 else
12098 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099
12100 Py_DECREF(str1);
12101 Py_DECREF(str2);
12102 return result;
12103}
12104
Alexander Belopolsky40018472011-02-26 01:02:56 +000012105static PyObject *
12106unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012108 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 Py_ssize_t isize;
12110 Py_ssize_t osize, squote, dquote, i, o;
12111 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012112 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012116 return NULL;
12117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 isize = PyUnicode_GET_LENGTH(unicode);
12119 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 /* Compute length of output, quote characters, and
12122 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012123 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 max = 127;
12125 squote = dquote = 0;
12126 ikind = PyUnicode_KIND(unicode);
12127 for (i = 0; i < isize; i++) {
12128 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12129 switch (ch) {
12130 case '\'': squote++; osize++; break;
12131 case '"': dquote++; osize++; break;
12132 case '\\': case '\t': case '\r': case '\n':
12133 osize += 2; break;
12134 default:
12135 /* Fast-path ASCII */
12136 if (ch < ' ' || ch == 0x7f)
12137 osize += 4; /* \xHH */
12138 else if (ch < 0x7f)
12139 osize++;
12140 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12141 osize++;
12142 max = ch > max ? ch : max;
12143 }
12144 else if (ch < 0x100)
12145 osize += 4; /* \xHH */
12146 else if (ch < 0x10000)
12147 osize += 6; /* \uHHHH */
12148 else
12149 osize += 10; /* \uHHHHHHHH */
12150 }
12151 }
12152
12153 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012154 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012156 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (dquote)
12158 /* Both squote and dquote present. Use squote,
12159 and escape them */
12160 osize += squote;
12161 else
12162 quote = '"';
12163 }
Victor Stinner55c08782013-04-14 18:45:39 +020012164 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165
12166 repr = PyUnicode_New(osize, max);
12167 if (repr == NULL)
12168 return NULL;
12169 okind = PyUnicode_KIND(repr);
12170 odata = PyUnicode_DATA(repr);
12171
12172 PyUnicode_WRITE(okind, odata, 0, quote);
12173 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012174 if (unchanged) {
12175 _PyUnicode_FastCopyCharacters(repr, 1,
12176 unicode, 0,
12177 isize);
12178 }
12179 else {
12180 for (i = 0, o = 1; i < isize; i++) {
12181 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182
Victor Stinner55c08782013-04-14 18:45:39 +020012183 /* Escape quotes and backslashes */
12184 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012185 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012187 continue;
12188 }
12189
12190 /* Map special whitespace to '\t', \n', '\r' */
12191 if (ch == '\t') {
12192 PyUnicode_WRITE(okind, odata, o++, '\\');
12193 PyUnicode_WRITE(okind, odata, o++, 't');
12194 }
12195 else if (ch == '\n') {
12196 PyUnicode_WRITE(okind, odata, o++, '\\');
12197 PyUnicode_WRITE(okind, odata, o++, 'n');
12198 }
12199 else if (ch == '\r') {
12200 PyUnicode_WRITE(okind, odata, o++, '\\');
12201 PyUnicode_WRITE(okind, odata, o++, 'r');
12202 }
12203
12204 /* Map non-printable US ASCII to '\xhh' */
12205 else if (ch < ' ' || ch == 0x7F) {
12206 PyUnicode_WRITE(okind, odata, o++, '\\');
12207 PyUnicode_WRITE(okind, odata, o++, 'x');
12208 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12209 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12210 }
12211
12212 /* Copy ASCII characters as-is */
12213 else if (ch < 0x7F) {
12214 PyUnicode_WRITE(okind, odata, o++, ch);
12215 }
12216
12217 /* Non-ASCII characters */
12218 else {
12219 /* Map Unicode whitespace and control characters
12220 (categories Z* and C* except ASCII space)
12221 */
12222 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12223 PyUnicode_WRITE(okind, odata, o++, '\\');
12224 /* Map 8-bit characters to '\xhh' */
12225 if (ch <= 0xff) {
12226 PyUnicode_WRITE(okind, odata, o++, 'x');
12227 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12228 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12229 }
12230 /* Map 16-bit characters to '\uxxxx' */
12231 else if (ch <= 0xffff) {
12232 PyUnicode_WRITE(okind, odata, o++, 'u');
12233 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12234 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12235 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12236 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12237 }
12238 /* Map 21-bit characters to '\U00xxxxxx' */
12239 else {
12240 PyUnicode_WRITE(okind, odata, o++, 'U');
12241 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12242 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12243 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12244 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12245 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12246 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12247 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12248 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12249 }
12250 }
12251 /* Copy characters as-is */
12252 else {
12253 PyUnicode_WRITE(okind, odata, o++, ch);
12254 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012255 }
12256 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012259 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012260 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261}
12262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012263PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012264 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265\n\
12266Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012267such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268arguments start and end are interpreted as in slice notation.\n\
12269\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012270Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271
12272static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012275 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012276 Py_ssize_t start;
12277 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012278 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279
Jesus Ceaac451502011-04-20 17:09:23 +020012280 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12281 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
Christian Heimesea71a522013-06-29 21:17:34 +020012284 if (PyUnicode_READY(self) == -1) {
12285 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012287 }
12288 if (PyUnicode_READY(substring) == -1) {
12289 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292
Victor Stinner7931d9a2011-11-04 00:22:48 +010012293 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294
12295 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 if (result == -2)
12298 return NULL;
12299
Christian Heimes217cfd12007-12-02 14:31:20 +000012300 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301}
12302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012303PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012306Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
12308static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012311 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012312 Py_ssize_t start;
12313 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012314 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315
Jesus Ceaac451502011-04-20 17:09:23 +020012316 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12317 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012318 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319
Christian Heimesea71a522013-06-29 21:17:34 +020012320 if (PyUnicode_READY(self) == -1) {
12321 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012323 }
12324 if (PyUnicode_READY(substring) == -1) {
12325 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328
Victor Stinner7931d9a2011-11-04 00:22:48 +010012329 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330
12331 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 if (result == -2)
12334 return NULL;
12335
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336 if (result < 0) {
12337 PyErr_SetString(PyExc_ValueError, "substring not found");
12338 return NULL;
12339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340
Christian Heimes217cfd12007-12-02 14:31:20 +000012341 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342}
12343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012344PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012347Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012348done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349
12350static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012351unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012353 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 Py_UCS4 fillchar = ' ';
12355
Victor Stinnere9a29352011-10-01 02:14:59 +020012356 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012358
Benjamin Petersonbac79492012-01-14 13:34:47 -050012359 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360 return NULL;
12361
Victor Stinnerc4b49542011-12-11 22:44:26 +010012362 if (PyUnicode_GET_LENGTH(self) >= width)
12363 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364
Victor Stinnerc4b49542011-12-11 22:44:26 +010012365 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366}
12367
Alexander Belopolsky40018472011-02-26 01:02:56 +000012368PyObject *
12369PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370{
12371 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012372
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373 s = PyUnicode_FromObject(s);
12374 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012375 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 if (sep != NULL) {
12377 sep = PyUnicode_FromObject(sep);
12378 if (sep == NULL) {
12379 Py_DECREF(s);
12380 return NULL;
12381 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 }
12383
Victor Stinner9310abb2011-10-05 00:59:23 +020012384 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385
12386 Py_DECREF(s);
12387 Py_XDECREF(sep);
12388 return result;
12389}
12390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012391PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012392 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393\n\
12394Return a list of the words in S, using sep as the\n\
12395delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012396splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012397whitespace string is a separator and empty strings are\n\
12398removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399
12400static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012401unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012403 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012405 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012407 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12408 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409 return NULL;
12410
12411 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012412 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012414 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012416 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417}
12418
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419PyObject *
12420PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12421{
12422 PyObject* str_obj;
12423 PyObject* sep_obj;
12424 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 int kind1, kind2, kind;
12426 void *buf1 = NULL, *buf2 = NULL;
12427 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012428
12429 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012430 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012431 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012432 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012433 if (!sep_obj) {
12434 Py_DECREF(str_obj);
12435 return NULL;
12436 }
12437 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12438 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012439 Py_DECREF(str_obj);
12440 return NULL;
12441 }
12442
Victor Stinner14f8f022011-10-05 20:58:25 +020012443 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012445 kind = Py_MAX(kind1, kind2);
12446 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012448 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 if (!buf1)
12450 goto onError;
12451 buf2 = PyUnicode_DATA(sep_obj);
12452 if (kind2 != kind)
12453 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12454 if (!buf2)
12455 goto onError;
12456 len1 = PyUnicode_GET_LENGTH(str_obj);
12457 len2 = PyUnicode_GET_LENGTH(sep_obj);
12458
Benjamin Petersonead6b532011-12-20 17:23:42 -060012459 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012461 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12462 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12463 else
12464 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 break;
12466 case PyUnicode_2BYTE_KIND:
12467 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12468 break;
12469 case PyUnicode_4BYTE_KIND:
12470 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12471 break;
12472 default:
12473 assert(0);
12474 out = 0;
12475 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012476
12477 Py_DECREF(sep_obj);
12478 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 if (kind1 != kind)
12480 PyMem_Free(buf1);
12481 if (kind2 != kind)
12482 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012483
12484 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 onError:
12486 Py_DECREF(sep_obj);
12487 Py_DECREF(str_obj);
12488 if (kind1 != kind && buf1)
12489 PyMem_Free(buf1);
12490 if (kind2 != kind && buf2)
12491 PyMem_Free(buf2);
12492 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012493}
12494
12495
12496PyObject *
12497PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12498{
12499 PyObject* str_obj;
12500 PyObject* sep_obj;
12501 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 int kind1, kind2, kind;
12503 void *buf1 = NULL, *buf2 = NULL;
12504 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012505
12506 str_obj = PyUnicode_FromObject(str_in);
12507 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012508 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012509 sep_obj = PyUnicode_FromObject(sep_in);
12510 if (!sep_obj) {
12511 Py_DECREF(str_obj);
12512 return NULL;
12513 }
12514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 kind1 = PyUnicode_KIND(str_in);
12516 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012517 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 buf1 = PyUnicode_DATA(str_in);
12519 if (kind1 != kind)
12520 buf1 = _PyUnicode_AsKind(str_in, kind);
12521 if (!buf1)
12522 goto onError;
12523 buf2 = PyUnicode_DATA(sep_obj);
12524 if (kind2 != kind)
12525 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12526 if (!buf2)
12527 goto onError;
12528 len1 = PyUnicode_GET_LENGTH(str_obj);
12529 len2 = PyUnicode_GET_LENGTH(sep_obj);
12530
Benjamin Petersonead6b532011-12-20 17:23:42 -060012531 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012533 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12534 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12535 else
12536 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 break;
12538 case PyUnicode_2BYTE_KIND:
12539 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12540 break;
12541 case PyUnicode_4BYTE_KIND:
12542 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12543 break;
12544 default:
12545 assert(0);
12546 out = 0;
12547 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012548
12549 Py_DECREF(sep_obj);
12550 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 if (kind1 != kind)
12552 PyMem_Free(buf1);
12553 if (kind2 != kind)
12554 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012555
12556 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 onError:
12558 Py_DECREF(sep_obj);
12559 Py_DECREF(str_obj);
12560 if (kind1 != kind && buf1)
12561 PyMem_Free(buf1);
12562 if (kind2 != kind && buf2)
12563 PyMem_Free(buf2);
12564 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012565}
12566
/* Docstring text for the partition method (see unicode_partition() below). */
12567PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012568 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012569\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012570Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012571the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012572found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012573
12574static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012575unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012576{
Victor Stinner9310abb2011-10-05 00:59:23 +020012577 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012578}
12579
/* Docstring text for the rpartition method (see unicode_rpartition() below). */
12580PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012581 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012582\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012583Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012584the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012585separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012586
12587static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012588unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012589{
Victor Stinner9310abb2011-10-05 00:59:23 +020012590 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012591}
12592
Alexander Belopolsky40018472011-02-26 01:02:56 +000012593PyObject *
12594PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012595{
12596 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012597
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012598 s = PyUnicode_FromObject(s);
12599 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012600 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 if (sep != NULL) {
12602 sep = PyUnicode_FromObject(sep);
12603 if (sep == NULL) {
12604 Py_DECREF(s);
12605 return NULL;
12606 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012607 }
12608
Victor Stinner9310abb2011-10-05 00:59:23 +020012609 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012610
12611 Py_DECREF(s);
12612 Py_XDECREF(sep);
12613 return result;
12614}
12615
/* Docstring text for the rsplit method (see unicode_rsplit() below). */
12616PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012617 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012618\n\
12619Return a list of the words in S, using sep as the\n\
12620delimiter string, starting at the end of the string and\n\
12621working to the front. If maxsplit is given, at most maxsplit\n\
12622splits are done. If sep is not specified, any whitespace string\n\
12623is a separator.");
12624
12625static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012626unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012627{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012628 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012629 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012630 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012631
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012632 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12633 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012634 return NULL;
12635
12636 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012638 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012639 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012640 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012641 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012642}
12643
/* Docstring text for the splitlines method (see unicode_splitlines() below). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012644PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646\n\
12647Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012648Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012649is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
12651static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012652unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012654 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012655 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012657 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12658 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659 return NULL;
12660
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012661 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662}
12663
12664static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012665PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012667 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668}
12669
/* Docstring text for the swapcase method (see unicode_swapcase() below). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012670PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672\n\
12673Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012674and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
12676static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012677unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012679 if (PyUnicode_READY(self) == -1)
12680 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012681 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682}
12683
Larry Hastings31826802013-10-19 00:09:25 -070012684/*[clinic]
12685module str
Georg Brandlceee0772007-11-27 23:48:05 +000012686
Larry Hastings31826802013-10-19 00:09:25 -070012687@staticmethod
12688str.maketrans as unicode_maketrans
12689
12690 x: object
12691
12692 y: unicode=NULL
12693
12694 z: unicode=NULL
12695
12696 /
12697
12698Return a translation table usable for str.translate().
12699
12700If there is only one argument, it must be a dictionary mapping Unicode
12701ordinals (integers) or characters to Unicode ordinals, strings or None.
12702Character keys will be then converted to ordinals.
12703If there are two arguments, they must be strings of equal length, and
12704in the resulting dictionary, each character in x will be mapped to the
12705character at the same position in y. If there is a third argument, it
12706must be a string, whose characters will be mapped to None in the result.
12707[clinic]*/
12708
12709PyDoc_STRVAR(unicode_maketrans__doc__,
12710"Return a translation table usable for str.translate().\n"
12711"\n"
12712"str.maketrans(x, y=None, z=None)\n"
12713"\n"
12714"If there is only one argument, it must be a dictionary mapping Unicode\n"
12715"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12716"Character keys will be then converted to ordinals.\n"
12717"If there are two arguments, they must be strings of equal length, and\n"
12718"in the resulting dictionary, each character in x will be mapped to the\n"
12719"character at the same position in y. If there is a third argument, it\n"
12720"must be a string, whose characters will be mapped to None in the result.");
12721
12722#define UNICODE_MAKETRANS_METHODDEF \
12723 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12724
12725static PyObject *
12726unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
12727
12728static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012729unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012730{
Larry Hastings31826802013-10-19 00:09:25 -070012731 PyObject *return_value = NULL;
12732 PyObject *x;
12733 PyObject *y = NULL;
12734 PyObject *z = NULL;
12735
12736 if (!PyArg_ParseTuple(args,
12737 "O|UU:maketrans",
12738 &x, &y, &z))
12739 goto exit;
12740 return_value = unicode_maketrans_impl(x, y, z);
12741
12742exit:
12743 return return_value;
12744}
12745
12746static PyObject *
12747unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12748/*[clinic checksum: 137db9c3199e7906b7967009f511c24fa3235b5f]*/
12749{
Georg Brandlceee0772007-11-27 23:48:05 +000012750 PyObject *new = NULL, *key, *value;
12751 Py_ssize_t i = 0;
12752 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012753
Georg Brandlceee0772007-11-27 23:48:05 +000012754 new = PyDict_New();
12755 if (!new)
12756 return NULL;
12757 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 int x_kind, y_kind, z_kind;
12759 void *x_data, *y_data, *z_data;
12760
Georg Brandlceee0772007-11-27 23:48:05 +000012761 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012762 if (!PyUnicode_Check(x)) {
12763 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12764 "be a string if there is a second argument");
12765 goto err;
12766 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012768 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12769 "arguments must have equal length");
12770 goto err;
12771 }
12772 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 x_kind = PyUnicode_KIND(x);
12774 y_kind = PyUnicode_KIND(y);
12775 x_data = PyUnicode_DATA(x);
12776 y_data = PyUnicode_DATA(y);
12777 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12778 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012779 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012780 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012781 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012782 if (!value) {
12783 Py_DECREF(key);
12784 goto err;
12785 }
Georg Brandlceee0772007-11-27 23:48:05 +000012786 res = PyDict_SetItem(new, key, value);
12787 Py_DECREF(key);
12788 Py_DECREF(value);
12789 if (res < 0)
12790 goto err;
12791 }
12792 /* create entries for deleting chars in z */
12793 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794 z_kind = PyUnicode_KIND(z);
12795 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012796 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012798 if (!key)
12799 goto err;
12800 res = PyDict_SetItem(new, key, Py_None);
12801 Py_DECREF(key);
12802 if (res < 0)
12803 goto err;
12804 }
12805 }
12806 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 int kind;
12808 void *data;
12809
Georg Brandlceee0772007-11-27 23:48:05 +000012810 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012811 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012812 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12813 "to maketrans it must be a dict");
12814 goto err;
12815 }
12816 /* copy entries into the new dict, converting string keys to int keys */
12817 while (PyDict_Next(x, &i, &key, &value)) {
12818 if (PyUnicode_Check(key)) {
12819 /* convert string keys to integer keys */
12820 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012821 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012822 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12823 "table must be of length 1");
12824 goto err;
12825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826 kind = PyUnicode_KIND(key);
12827 data = PyUnicode_DATA(key);
12828 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012829 if (!newkey)
12830 goto err;
12831 res = PyDict_SetItem(new, newkey, value);
12832 Py_DECREF(newkey);
12833 if (res < 0)
12834 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012835 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012836 /* just keep integer keys */
12837 if (PyDict_SetItem(new, key, value) < 0)
12838 goto err;
12839 } else {
12840 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12841 "be strings or integers");
12842 goto err;
12843 }
12844 }
12845 }
12846 return new;
12847 err:
12848 Py_DECREF(new);
12849 return NULL;
12850}
12851
/* Docstring text for the translate method (see unicode_translate() below). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012852PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854\n\
12855Return a copy of the string S, where all characters have been mapped\n\
12856through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012857Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012858Unmapped characters are left untouched. Characters mapped to None\n\
12859are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860
12861static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012862unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012864 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865}
12866
/* Docstring text for the upper method (see unicode_upper() below). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012867PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012870Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871
12872static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012873unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012875 if (PyUnicode_READY(self) == -1)
12876 return NULL;
12877 if (PyUnicode_IS_ASCII(self))
12878 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012879 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880}
12881
/* Docstring text for the zfill method (see unicode_zfill() below). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012882PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012883 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012885Pad a numeric string S with zeros on the left, to fill a field\n\
12886of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887
12888static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012889unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012891 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012892 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012893 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 int kind;
12895 void *data;
12896 Py_UCS4 chr;
12897
Martin v. Löwis18e16552006-02-15 17:27:45 +000012898 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899 return NULL;
12900
Benjamin Petersonbac79492012-01-14 13:34:47 -050012901 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903
Victor Stinnerc4b49542011-12-11 22:44:26 +010012904 if (PyUnicode_GET_LENGTH(self) >= width)
12905 return unicode_result_unchanged(self);
12906
12907 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908
12909 u = pad(self, fill, 0, '0');
12910
Walter Dörwald068325e2002-04-15 13:36:47 +000012911 if (u == NULL)
12912 return NULL;
12913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 kind = PyUnicode_KIND(u);
12915 data = PyUnicode_DATA(u);
12916 chr = PyUnicode_READ(kind, data, fill);
12917
12918 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 PyUnicode_WRITE(kind, data, 0, chr);
12921 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922 }
12923
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012924 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012925 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12935
/* Docstring text for the startswith method (see unicode_startswith() below). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012936PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012937 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012939Return True if S starts with the specified prefix, False otherwise.\n\
12940With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012941With optional end, stop comparing S at that position.\n\
12942prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943
12944static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012945unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012946 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012948 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012949 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012950 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012951 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012952 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953
Jesus Ceaac451502011-04-20 17:09:23 +020012954 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012955 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012956 if (PyTuple_Check(subobj)) {
12957 Py_ssize_t i;
12958 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012959 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012960 if (substring == NULL)
12961 return NULL;
12962 result = tailmatch(self, substring, start, end, -1);
12963 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012964 if (result == -1)
12965 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012966 if (result) {
12967 Py_RETURN_TRUE;
12968 }
12969 }
12970 /* nothing matched */
12971 Py_RETURN_FALSE;
12972 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012973 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012974 if (substring == NULL) {
12975 if (PyErr_ExceptionMatches(PyExc_TypeError))
12976 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12977 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012978 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012979 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012980 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012981 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012982 if (result == -1)
12983 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012984 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012985}
12986
12987
/* Docstring text for the endswith method (see unicode_endswith() below). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012988PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012991Return True if S ends with the specified suffix, False otherwise.\n\
12992With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012993With optional end, stop comparing S at that position.\n\
12994suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995
12996static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012997unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013000 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013001 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013002 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013003 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013004 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005
Jesus Ceaac451502011-04-20 17:09:23 +020013006 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013007 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013008 if (PyTuple_Check(subobj)) {
13009 Py_ssize_t i;
13010 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013011 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013013 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013014 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013015 result = tailmatch(self, substring, start, end, +1);
13016 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013017 if (result == -1)
13018 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013019 if (result) {
13020 Py_RETURN_TRUE;
13021 }
13022 }
13023 Py_RETURN_FALSE;
13024 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013025 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013026 if (substring == NULL) {
13027 if (PyErr_ExceptionMatches(PyExc_TypeError))
13028 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13029 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013030 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013031 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013032 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013033 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013034 if (result == -1)
13035 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013036 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013037}
13038
Victor Stinner202fdca2012-05-07 12:47:02 +020013039Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013040_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013041{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013042 if (!writer->readonly)
13043 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13044 else {
13045 /* Copy-on-write mode: set buffer size to 0 so
13046 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13047 * next write. */
13048 writer->size = 0;
13049 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013050 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13051 writer->data = PyUnicode_DATA(writer->buffer);
13052 writer->kind = PyUnicode_KIND(writer->buffer);
13053}
13054
Victor Stinnerd3f08822012-05-29 12:57:52 +020013055void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013056_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013057{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013058 memset(writer, 0, sizeof(*writer));
13059#ifdef Py_DEBUG
13060 writer->kind = 5; /* invalid kind */
13061#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013062 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013063}
13064
Victor Stinnerd3f08822012-05-29 12:57:52 +020013065int
13066_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13067 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013068{
13069 Py_ssize_t newlen;
13070 PyObject *newbuffer;
13071
Victor Stinnerd3f08822012-05-29 12:57:52 +020013072 assert(length > 0);
13073
Victor Stinner202fdca2012-05-07 12:47:02 +020013074 if (length > PY_SSIZE_T_MAX - writer->pos) {
13075 PyErr_NoMemory();
13076 return -1;
13077 }
13078 newlen = writer->pos + length;
13079
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013080 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013081
Victor Stinnerd3f08822012-05-29 12:57:52 +020013082 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013083 assert(!writer->readonly);
13084 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013085 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013086 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013087 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013088 if (newlen < writer->min_length)
13089 newlen = writer->min_length;
13090
Victor Stinnerd3f08822012-05-29 12:57:52 +020013091 writer->buffer = PyUnicode_New(newlen, maxchar);
13092 if (writer->buffer == NULL)
13093 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013094 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013095 else if (newlen > writer->size) {
13096 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013097 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013098 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013099 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013100 if (newlen < writer->min_length)
13101 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013102
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013103 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013104 /* resize + widen */
13105 newbuffer = PyUnicode_New(newlen, maxchar);
13106 if (newbuffer == NULL)
13107 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013108 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13109 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013110 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013111 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013112 }
13113 else {
13114 newbuffer = resize_compact(writer->buffer, newlen);
13115 if (newbuffer == NULL)
13116 return -1;
13117 }
13118 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013119 }
13120 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013121 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013122 newbuffer = PyUnicode_New(writer->size, maxchar);
13123 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013124 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013125 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13126 writer->buffer, 0, writer->pos);
13127 Py_DECREF(writer->buffer);
13128 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013129 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013130 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013131 return 0;
13132}
13133
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013134Py_LOCAL_INLINE(int)
13135_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013136{
13137 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13138 return -1;
13139 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13140 writer->pos++;
13141 return 0;
13142}
13143
13144int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013145_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13146{
13147 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13148}
13149
13150int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013151_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13152{
13153 Py_UCS4 maxchar;
13154 Py_ssize_t len;
13155
13156 if (PyUnicode_READY(str) == -1)
13157 return -1;
13158 len = PyUnicode_GET_LENGTH(str);
13159 if (len == 0)
13160 return 0;
13161 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13162 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013163 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013164 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013165 Py_INCREF(str);
13166 writer->buffer = str;
13167 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013168 writer->pos += len;
13169 return 0;
13170 }
13171 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13172 return -1;
13173 }
13174 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13175 str, 0, len);
13176 writer->pos += len;
13177 return 0;
13178}
13179
Victor Stinnere215d962012-10-06 23:03:36 +020013180int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013181_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13182 Py_ssize_t start, Py_ssize_t end)
13183{
13184 Py_UCS4 maxchar;
13185 Py_ssize_t len;
13186
13187 if (PyUnicode_READY(str) == -1)
13188 return -1;
13189
13190 assert(0 <= start);
13191 assert(end <= PyUnicode_GET_LENGTH(str));
13192 assert(start <= end);
13193
13194 if (end == 0)
13195 return 0;
13196
13197 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13198 return _PyUnicodeWriter_WriteStr(writer, str);
13199
13200 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13201 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13202 else
13203 maxchar = writer->maxchar;
13204 len = end - start;
13205
13206 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13207 return -1;
13208
13209 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13210 str, start, len);
13211 writer->pos += len;
13212 return 0;
13213}
13214
13215int
Victor Stinnere215d962012-10-06 23:03:36 +020013216_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13217{
13218 Py_UCS4 maxchar;
13219
13220 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13221 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13222 return -1;
13223 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13224 writer->pos += len;
13225 return 0;
13226}
13227
Victor Stinnerd3f08822012-05-29 12:57:52 +020013228PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013229_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013230{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013231 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013232 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013233 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013234 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013235 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013236 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013237 str = writer->buffer;
13238 writer->buffer = NULL;
13239 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13240 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013241 }
13242 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13243 PyObject *newbuffer;
13244 newbuffer = resize_compact(writer->buffer, writer->pos);
13245 if (newbuffer == NULL) {
13246 Py_DECREF(writer->buffer);
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013247 writer->buffer = NULL;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013248 return NULL;
13249 }
13250 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013251 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013252 str = writer->buffer;
13253 writer->buffer = NULL;
13254 assert(_PyUnicode_CheckConsistency(str, 1));
13255 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013256}
13257
Victor Stinnerd3f08822012-05-29 12:57:52 +020013258void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013259_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013260{
13261 Py_CLEAR(writer->buffer);
13262}
13263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013265
/* Docstrings referenced by the "format" and "format_map" entries of the
   unicode_methods table. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013277
Eric Smith4a7d76d2008-05-30 18:10:19 +000013278static PyObject *
13279unicode__format__(PyObject* self, PyObject* args)
13280{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013281 PyObject *format_spec;
13282 _PyUnicodeWriter writer;
13283 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013284
13285 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13286 return NULL;
13287
Victor Stinnerd3f08822012-05-29 12:57:52 +020013288 if (PyUnicode_READY(self) == -1)
13289 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013290 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13292 self, format_spec, 0,
13293 PyUnicode_GET_LENGTH(format_spec));
13294 if (ret == -1) {
13295 _PyUnicodeWriter_Dealloc(&writer);
13296 return NULL;
13297 }
13298 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013299}
13300
Eric Smith8c663262007-08-25 02:26:07 +000013301PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013302 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013303\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013304Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013305
13306static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013307unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309 Py_ssize_t size;
13310
13311 /* If it's a compact object, account for base structure +
13312 character data. */
13313 if (PyUnicode_IS_COMPACT_ASCII(v))
13314 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13315 else if (PyUnicode_IS_COMPACT(v))
13316 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013317 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 else {
13319 /* If it is a two-block object, account for base object, and
13320 for character block if present. */
13321 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013322 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013323 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013324 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013325 }
13326 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013327 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013328 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013330 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013331 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332
13333 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013334}
13335
13336PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013337 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013338
13339static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013340unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013341{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013342 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013343 if (!copy)
13344 return NULL;
13345 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013346}
13347
/* Method table: one PyMethodDef entry per named method, giving the
   Python-level name, the C implementation, the calling-convention flags
   and (where provided) the docstring.  The table is terminated by the
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* Entry supplied by a macro defined elsewhere -- presumably the
       "maketrans" method; confirm against the macro's definition. */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13403
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013404static PyObject *
13405unicode_mod(PyObject *v, PyObject *w)
13406{
Brian Curtindfc80e32011-08-10 20:28:54 -050013407 if (!PyUnicode_Check(v))
13408 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013409 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013410}
13411
/* Number-protocol slots.  Only nb_remainder is filled in (with
   unicode_mod, which performs %-formatting); the slots after it are
   left to default zero-initialisation. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13418
/* Sequence-protocol slots: length, concatenation, repetition, item
   access and containment.  The slice and item-assignment slots are left
   empty (0). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13429
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013430static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013431unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013432{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013433 if (PyUnicode_READY(self) == -1)
13434 return NULL;
13435
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013436 if (PyIndex_Check(item)) {
13437 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013438 if (i == -1 && PyErr_Occurred())
13439 return NULL;
13440 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013441 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013442 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013443 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013444 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013445 PyObject *result;
13446 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013447 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013448 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013452 return NULL;
13453 }
13454
13455 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013456 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013457 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013458 slicelength == PyUnicode_GET_LENGTH(self)) {
13459 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013460 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013461 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013462 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013463 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013464 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013465 src_kind = PyUnicode_KIND(self);
13466 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013467 if (!PyUnicode_IS_ASCII(self)) {
13468 kind_limit = kind_maxchar_limit(src_kind);
13469 max_char = 0;
13470 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13471 ch = PyUnicode_READ(src_kind, src_data, cur);
13472 if (ch > max_char) {
13473 max_char = ch;
13474 if (max_char >= kind_limit)
13475 break;
13476 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013477 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013478 }
Victor Stinner55c99112011-10-13 01:17:06 +020013479 else
13480 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013481 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013482 if (result == NULL)
13483 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013484 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013485 dest_data = PyUnicode_DATA(result);
13486
13487 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013488 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13489 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013490 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013491 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013492 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013493 } else {
13494 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13495 return NULL;
13496 }
13497}
13498
/* Mapping-protocol slots: length and subscript (index or slice) are
   provided; subscript assignment is not (the slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13504
Guido van Rossumd57fd912000-03-10 22:53:23 +000013505
Guido van Rossumd57fd912000-03-10 22:53:23 +000013506/* Helpers for PyUnicode_Format() */
13507
/* Context passed between the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Arguments to consume.  unicode_format_getnextarg() returns `args`
       itself when arglen is negative and otherwise indexes it as a
       tuple. */
    PyObject *args;
    int args_owned;             /* NOTE(review): presumably non-zero when
                                   `args` holds an owned reference --
                                   confirm */
    /* arglen: argument count (negative: `args` is a single non-tuple
       object); argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    PyObject *dict;             /* NOTE(review): presumably the mapping
                                   used for %(name)s lookups -- confirm */

    /* NOTE(review): the fmt* fields look like the format string plus
       its kind, data pointer, remaining count and read position --
       confirm against PyUnicode_Format(). */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    _PyUnicodeWriter writer;    /* writer state carried with the context */
};
13521
/* One parsed conversion specification of a %-format string. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatlong() switches on
                           it and formatfloat() passes it on as the
                           float format code */
    int flags;          /* bitmask of format flags (e.g. F_ALT) */
    Py_ssize_t width;   /* NOTE(review): presumably the minimum field
                           width -- not used in the code visible here */
    int prec;           /* precision; formatfloat() substitutes 6 when it
                           is negative */
    int sign;           /* NOTE(review): meaning not visible in this part
                           of the file -- confirm */
};
13529
Guido van Rossumd57fd912000-03-10 22:53:23 +000013530static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013531unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013532{
Victor Stinnera47082312012-10-04 02:19:54 +020013533 Py_ssize_t argidx = ctx->argidx;
13534
13535 if (argidx < ctx->arglen) {
13536 ctx->argidx++;
13537 if (ctx->arglen < 0)
13538 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013539 else
Victor Stinnera47082312012-10-04 02:19:54 +020013540 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013541 }
13542 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013544 return NULL;
13545}
13546
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013547/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013548
Victor Stinnera47082312012-10-04 02:19:54 +020013549/* Format a float into the writer if the writer is not NULL, or into *p_output
13550 otherwise.
13551
13552 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013553static int
Victor Stinnera47082312012-10-04 02:19:54 +020013554formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13555 PyObject **p_output,
13556 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013557{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013558 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013559 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013561 int prec;
13562 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013563
Guido van Rossumd57fd912000-03-10 22:53:23 +000013564 x = PyFloat_AsDouble(v);
13565 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013566 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013567
Victor Stinnera47082312012-10-04 02:19:54 +020013568 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013569 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013570 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013571
Victor Stinnera47082312012-10-04 02:19:54 +020013572 if (arg->flags & F_ALT)
13573 dtoa_flags = Py_DTSF_ALT;
13574 else
13575 dtoa_flags = 0;
13576 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013577 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013578 return -1;
13579 len = strlen(p);
13580 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013581 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13582 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013583 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013584 }
Victor Stinner184252a2012-06-16 02:57:41 +020013585 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013586 writer->pos += len;
13587 }
13588 else
13589 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013590 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013591 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013592}
13593
Victor Stinnerd0880d52012-04-27 23:40:13 +020013594/* formatlong() emulates the format codes d, u, o, x and X, and
13595 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13596 * Python's regular ints.
13597 * Return value: a new PyUnicodeObject*, or NULL if error.
13598 * The output string is of the form
13599 * "-"? ("0x" | "0X")? digit+
13600 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13601 * set in flags. The case of hex digits will be correct,
13602 * There will be at least prec digits, zero-filled on the left if
13603 * necessary to get that many.
13604 * val object to be converted
13605 * flags bitmask of format flags; only F_ALT is looked at
13606 * prec minimum number of digits; 0-fill on left if needed
13607 * type a character in [duoxX]; u acts the same as d
13608 *
13609 * CAUTION: o, x and X conversions on regular ints can never
13610 * produce a '-' sign, but can for Python's unbounded ints.
13611 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013612static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013613formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013614{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013615 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013616 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013617 Py_ssize_t i;
13618 int sign; /* 1 if '-', else 0 */
13619 int len; /* number of characters */
13620 Py_ssize_t llen;
13621 int numdigits; /* len == numnondigits + numdigits */
13622 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013623 int prec = arg->prec;
13624 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013625
Victor Stinnerd0880d52012-04-27 23:40:13 +020013626 /* Avoid exceeding SSIZE_T_MAX */
13627 if (prec > INT_MAX-3) {
13628 PyErr_SetString(PyExc_OverflowError,
13629 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013630 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013631 }
13632
13633 assert(PyLong_Check(val));
13634
13635 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013636 default:
13637 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013638 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013639 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013640 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013641 /* int and int subclasses should print numerically when a numeric */
13642 /* format code is used (see issue18780) */
13643 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013644 break;
13645 case 'o':
13646 numnondigits = 2;
13647 result = PyNumber_ToBase(val, 8);
13648 break;
13649 case 'x':
13650 case 'X':
13651 numnondigits = 2;
13652 result = PyNumber_ToBase(val, 16);
13653 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013654 }
13655 if (!result)
13656 return NULL;
13657
13658 assert(unicode_modifiable(result));
13659 assert(PyUnicode_IS_READY(result));
13660 assert(PyUnicode_IS_ASCII(result));
13661
13662 /* To modify the string in-place, there can only be one reference. */
13663 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013664 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013665 PyErr_BadInternalCall();
13666 return NULL;
13667 }
13668 buf = PyUnicode_DATA(result);
13669 llen = PyUnicode_GET_LENGTH(result);
13670 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013671 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013672 PyErr_SetString(PyExc_ValueError,
13673 "string too large in _PyBytes_FormatLong");
13674 return NULL;
13675 }
13676 len = (int)llen;
13677 sign = buf[0] == '-';
13678 numnondigits += sign;
13679 numdigits = len - numnondigits;
13680 assert(numdigits > 0);
13681
13682 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013683 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013684 (type == 'o' || type == 'x' || type == 'X'))) {
13685 assert(buf[sign] == '0');
13686 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13687 buf[sign+1] == 'o');
13688 numnondigits -= 2;
13689 buf += 2;
13690 len -= 2;
13691 if (sign)
13692 buf[0] = '-';
13693 assert(len == numnondigits + numdigits);
13694 assert(numdigits > 0);
13695 }
13696
13697 /* Fill with leading zeroes to meet minimum width. */
13698 if (prec > numdigits) {
13699 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13700 numnondigits + prec);
13701 char *b1;
13702 if (!r1) {
13703 Py_DECREF(result);
13704 return NULL;
13705 }
13706 b1 = PyBytes_AS_STRING(r1);
13707 for (i = 0; i < numnondigits; ++i)
13708 *b1++ = *buf++;
13709 for (i = 0; i < prec - numdigits; i++)
13710 *b1++ = '0';
13711 for (i = 0; i < numdigits; i++)
13712 *b1++ = *buf++;
13713 *b1 = '\0';
13714 Py_DECREF(result);
13715 result = r1;
13716 buf = PyBytes_AS_STRING(result);
13717 len = numnondigits + prec;
13718 }
13719
13720 /* Fix up case for hex conversions. */
13721 if (type == 'X') {
13722 /* Need to convert all lower case letters to upper case.
13723 and need to convert 0x to 0X (and -0x to -0X). */
13724 for (i = 0; i < len; i++)
13725 if (buf[i] >= 'a' && buf[i] <= 'x')
13726 buf[i] -= 'a'-'A';
13727 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013728 if (!PyUnicode_Check(result)
13729 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013730 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013731 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013732 Py_DECREF(result);
13733 result = unicode;
13734 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013735 else if (len != PyUnicode_GET_LENGTH(result)) {
13736 if (PyUnicode_Resize(&result, len) < 0)
13737 Py_CLEAR(result);
13738 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013739 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013740}
13741
Victor Stinner621ef3d2012-10-02 00:33:47 +020013742/* Format an integer.
13743 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013744 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013745 * -1 and raise an exception on error */
13746static int
Victor Stinnera47082312012-10-04 02:19:54 +020013747mainformatlong(PyObject *v,
13748 struct unicode_format_arg_t *arg,
13749 PyObject **p_output,
13750 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013751{
13752 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013753 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013754
13755 if (!PyNumber_Check(v))
13756 goto wrongtype;
13757
13758 if (!PyLong_Check(v)) {
13759 iobj = PyNumber_Long(v);
13760 if (iobj == NULL) {
13761 if (PyErr_ExceptionMatches(PyExc_TypeError))
13762 goto wrongtype;
13763 return -1;
13764 }
13765 assert(PyLong_Check(iobj));
13766 }
13767 else {
13768 iobj = v;
13769 Py_INCREF(iobj);
13770 }
13771
13772 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013773 && arg->width == -1 && arg->prec == -1
13774 && !(arg->flags & (F_SIGN | F_BLANK))
13775 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013776 {
13777 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013778 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013779 int base;
13780
Victor Stinnera47082312012-10-04 02:19:54 +020013781 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013782 {
13783 default:
13784 assert(0 && "'type' not in [diuoxX]");
13785 case 'd':
13786 case 'i':
13787 case 'u':
13788 base = 10;
13789 break;
13790 case 'o':
13791 base = 8;
13792 break;
13793 case 'x':
13794 case 'X':
13795 base = 16;
13796 break;
13797 }
13798
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013799 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13800 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013801 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013802 }
13803 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013804 return 1;
13805 }
13806
Victor Stinnera47082312012-10-04 02:19:54 +020013807 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013808 Py_DECREF(iobj);
13809 if (res == NULL)
13810 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013811 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013812 return 0;
13813
13814wrongtype:
13815 PyErr_Format(PyExc_TypeError,
13816 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013817 "not %.200s",
13818 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013819 return -1;
13820}
13821
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013822static Py_UCS4
13823formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013824{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013825 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013826 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013827 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013828 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013829 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 goto onError;
13831 }
13832 else {
13833 /* Integer input truncated to a character */
13834 long x;
13835 x = PyLong_AsLong(v);
13836 if (x == -1 && PyErr_Occurred())
13837 goto onError;
13838
Victor Stinner8faf8212011-12-08 22:14:11 +010013839 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 PyErr_SetString(PyExc_OverflowError,
13841 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013842 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 }
13844
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013845 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013846 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013847
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013849 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013851 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013852}
13853
Victor Stinnera47082312012-10-04 02:19:54 +020013854/* Parse options of an argument: flags, width, precision.
13855 Handle also "%(name)" syntax.
13856
13857 Return 0 if the argument has been formatted into arg->str.
13858 Return 1 if the argument has been written into ctx->writer,
13859 Raise an exception and return -1 on error. */
13860static int
13861unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13862 struct unicode_format_arg_t *arg)
13863{
13864#define FORMAT_READ(ctx) \
13865 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13866
13867 PyObject *v;
13868
Victor Stinnera47082312012-10-04 02:19:54 +020013869 if (arg->ch == '(') {
13870 /* Get argument value from a dictionary. Example: "%(name)s". */
13871 Py_ssize_t keystart;
13872 Py_ssize_t keylen;
13873 PyObject *key;
13874 int pcount = 1;
13875
13876 if (ctx->dict == NULL) {
13877 PyErr_SetString(PyExc_TypeError,
13878 "format requires a mapping");
13879 return -1;
13880 }
13881 ++ctx->fmtpos;
13882 --ctx->fmtcnt;
13883 keystart = ctx->fmtpos;
13884 /* Skip over balanced parentheses */
13885 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13886 arg->ch = FORMAT_READ(ctx);
13887 if (arg->ch == ')')
13888 --pcount;
13889 else if (arg->ch == '(')
13890 ++pcount;
13891 ctx->fmtpos++;
13892 }
13893 keylen = ctx->fmtpos - keystart - 1;
13894 if (ctx->fmtcnt < 0 || pcount > 0) {
13895 PyErr_SetString(PyExc_ValueError,
13896 "incomplete format key");
13897 return -1;
13898 }
13899 key = PyUnicode_Substring(ctx->fmtstr,
13900 keystart, keystart + keylen);
13901 if (key == NULL)
13902 return -1;
13903 if (ctx->args_owned) {
13904 Py_DECREF(ctx->args);
13905 ctx->args_owned = 0;
13906 }
13907 ctx->args = PyObject_GetItem(ctx->dict, key);
13908 Py_DECREF(key);
13909 if (ctx->args == NULL)
13910 return -1;
13911 ctx->args_owned = 1;
13912 ctx->arglen = -1;
13913 ctx->argidx = -2;
13914 }
13915
13916 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013917 while (--ctx->fmtcnt >= 0) {
13918 arg->ch = FORMAT_READ(ctx);
13919 ctx->fmtpos++;
13920 switch (arg->ch) {
13921 case '-': arg->flags |= F_LJUST; continue;
13922 case '+': arg->flags |= F_SIGN; continue;
13923 case ' ': arg->flags |= F_BLANK; continue;
13924 case '#': arg->flags |= F_ALT; continue;
13925 case '0': arg->flags |= F_ZERO; continue;
13926 }
13927 break;
13928 }
13929
13930 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013931 if (arg->ch == '*') {
13932 v = unicode_format_getnextarg(ctx);
13933 if (v == NULL)
13934 return -1;
13935 if (!PyLong_Check(v)) {
13936 PyErr_SetString(PyExc_TypeError,
13937 "* wants int");
13938 return -1;
13939 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013940 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013941 if (arg->width == -1 && PyErr_Occurred())
13942 return -1;
13943 if (arg->width < 0) {
13944 arg->flags |= F_LJUST;
13945 arg->width = -arg->width;
13946 }
13947 if (--ctx->fmtcnt >= 0) {
13948 arg->ch = FORMAT_READ(ctx);
13949 ctx->fmtpos++;
13950 }
13951 }
13952 else if (arg->ch >= '0' && arg->ch <= '9') {
13953 arg->width = arg->ch - '0';
13954 while (--ctx->fmtcnt >= 0) {
13955 arg->ch = FORMAT_READ(ctx);
13956 ctx->fmtpos++;
13957 if (arg->ch < '0' || arg->ch > '9')
13958 break;
13959 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13960 mixing signed and unsigned comparison. Since arg->ch is between
13961 '0' and '9', casting to int is safe. */
13962 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13963 PyErr_SetString(PyExc_ValueError,
13964 "width too big");
13965 return -1;
13966 }
13967 arg->width = arg->width*10 + (arg->ch - '0');
13968 }
13969 }
13970
13971 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013972 if (arg->ch == '.') {
13973 arg->prec = 0;
13974 if (--ctx->fmtcnt >= 0) {
13975 arg->ch = FORMAT_READ(ctx);
13976 ctx->fmtpos++;
13977 }
13978 if (arg->ch == '*') {
13979 v = unicode_format_getnextarg(ctx);
13980 if (v == NULL)
13981 return -1;
13982 if (!PyLong_Check(v)) {
13983 PyErr_SetString(PyExc_TypeError,
13984 "* wants int");
13985 return -1;
13986 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013987 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013988 if (arg->prec == -1 && PyErr_Occurred())
13989 return -1;
13990 if (arg->prec < 0)
13991 arg->prec = 0;
13992 if (--ctx->fmtcnt >= 0) {
13993 arg->ch = FORMAT_READ(ctx);
13994 ctx->fmtpos++;
13995 }
13996 }
13997 else if (arg->ch >= '0' && arg->ch <= '9') {
13998 arg->prec = arg->ch - '0';
13999 while (--ctx->fmtcnt >= 0) {
14000 arg->ch = FORMAT_READ(ctx);
14001 ctx->fmtpos++;
14002 if (arg->ch < '0' || arg->ch > '9')
14003 break;
14004 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14005 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014006 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014007 return -1;
14008 }
14009 arg->prec = arg->prec*10 + (arg->ch - '0');
14010 }
14011 }
14012 }
14013
14014 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14015 if (ctx->fmtcnt >= 0) {
14016 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14017 if (--ctx->fmtcnt >= 0) {
14018 arg->ch = FORMAT_READ(ctx);
14019 ctx->fmtpos++;
14020 }
14021 }
14022 }
14023 if (ctx->fmtcnt < 0) {
14024 PyErr_SetString(PyExc_ValueError,
14025 "incomplete format");
14026 return -1;
14027 }
14028 return 0;
14029
14030#undef FORMAT_READ
14031}
14032
14033/* Format one argument. Supported conversion specifiers:
14034
14035 - "s", "r", "a": any type
14036 - "i", "d", "u", "o", "x", "X": int
14037 - "e", "E", "f", "F", "g", "G": float
14038 - "c": int or str (1 character)
14039
Victor Stinner8dbd4212012-12-04 09:30:24 +010014040 When possible, the output is written directly into the Unicode writer
14041 (ctx->writer). A string is created when padding is required.
14042
Victor Stinnera47082312012-10-04 02:19:54 +020014043 Return 0 if the argument has been formatted into *p_str,
14044 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014045 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014046static int
14047unicode_format_arg_format(struct unicode_formatter_t *ctx,
14048 struct unicode_format_arg_t *arg,
14049 PyObject **p_str)
14050{
14051 PyObject *v;
14052 _PyUnicodeWriter *writer = &ctx->writer;
14053
14054 if (ctx->fmtcnt == 0)
14055 ctx->writer.overallocate = 0;
14056
14057 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014058 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014059 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014060 return 1;
14061 }
14062
14063 v = unicode_format_getnextarg(ctx);
14064 if (v == NULL)
14065 return -1;
14066
Victor Stinnera47082312012-10-04 02:19:54 +020014067
14068 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014069 case 's':
14070 case 'r':
14071 case 'a':
14072 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14073 /* Fast path */
14074 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14075 return -1;
14076 return 1;
14077 }
14078
14079 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14080 *p_str = v;
14081 Py_INCREF(*p_str);
14082 }
14083 else {
14084 if (arg->ch == 's')
14085 *p_str = PyObject_Str(v);
14086 else if (arg->ch == 'r')
14087 *p_str = PyObject_Repr(v);
14088 else
14089 *p_str = PyObject_ASCII(v);
14090 }
14091 break;
14092
14093 case 'i':
14094 case 'd':
14095 case 'u':
14096 case 'o':
14097 case 'x':
14098 case 'X':
14099 {
14100 int ret = mainformatlong(v, arg, p_str, writer);
14101 if (ret != 0)
14102 return ret;
14103 arg->sign = 1;
14104 break;
14105 }
14106
14107 case 'e':
14108 case 'E':
14109 case 'f':
14110 case 'F':
14111 case 'g':
14112 case 'G':
14113 if (arg->width == -1 && arg->prec == -1
14114 && !(arg->flags & (F_SIGN | F_BLANK)))
14115 {
14116 /* Fast path */
14117 if (formatfloat(v, arg, NULL, writer) == -1)
14118 return -1;
14119 return 1;
14120 }
14121
14122 arg->sign = 1;
14123 if (formatfloat(v, arg, p_str, NULL) == -1)
14124 return -1;
14125 break;
14126
14127 case 'c':
14128 {
14129 Py_UCS4 ch = formatchar(v);
14130 if (ch == (Py_UCS4) -1)
14131 return -1;
14132 if (arg->width == -1 && arg->prec == -1) {
14133 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014134 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014135 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014136 return 1;
14137 }
14138 *p_str = PyUnicode_FromOrdinal(ch);
14139 break;
14140 }
14141
14142 default:
14143 PyErr_Format(PyExc_ValueError,
14144 "unsupported format character '%c' (0x%x) "
14145 "at index %zd",
14146 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14147 (int)arg->ch,
14148 ctx->fmtpos - 1);
14149 return -1;
14150 }
14151 if (*p_str == NULL)
14152 return -1;
14153 assert (PyUnicode_Check(*p_str));
14154 return 0;
14155}
14156
14157static int
14158unicode_format_arg_output(struct unicode_formatter_t *ctx,
14159 struct unicode_format_arg_t *arg,
14160 PyObject *str)
14161{
14162 Py_ssize_t len;
14163 enum PyUnicode_Kind kind;
14164 void *pbuf;
14165 Py_ssize_t pindex;
14166 Py_UCS4 signchar;
14167 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014168 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014169 Py_ssize_t sublen;
14170 _PyUnicodeWriter *writer = &ctx->writer;
14171 Py_UCS4 fill;
14172
14173 fill = ' ';
14174 if (arg->sign && arg->flags & F_ZERO)
14175 fill = '0';
14176
14177 if (PyUnicode_READY(str) == -1)
14178 return -1;
14179
14180 len = PyUnicode_GET_LENGTH(str);
14181 if ((arg->width == -1 || arg->width <= len)
14182 && (arg->prec == -1 || arg->prec >= len)
14183 && !(arg->flags & (F_SIGN | F_BLANK)))
14184 {
14185 /* Fast path */
14186 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14187 return -1;
14188 return 0;
14189 }
14190
14191 /* Truncate the string for "s", "r" and "a" formats
14192 if the precision is set */
14193 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14194 if (arg->prec >= 0 && len > arg->prec)
14195 len = arg->prec;
14196 }
14197
14198 /* Adjust sign and width */
14199 kind = PyUnicode_KIND(str);
14200 pbuf = PyUnicode_DATA(str);
14201 pindex = 0;
14202 signchar = '\0';
14203 if (arg->sign) {
14204 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14205 if (ch == '-' || ch == '+') {
14206 signchar = ch;
14207 len--;
14208 pindex++;
14209 }
14210 else if (arg->flags & F_SIGN)
14211 signchar = '+';
14212 else if (arg->flags & F_BLANK)
14213 signchar = ' ';
14214 else
14215 arg->sign = 0;
14216 }
14217 if (arg->width < len)
14218 arg->width = len;
14219
14220 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014221 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014222 if (!(arg->flags & F_LJUST)) {
14223 if (arg->sign) {
14224 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014225 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014226 }
14227 else {
14228 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014229 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014230 }
14231 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014232 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14233 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014234 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014235 }
14236
Victor Stinnera47082312012-10-04 02:19:54 +020014237 buflen = arg->width;
14238 if (arg->sign && len == arg->width)
14239 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014240 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014241 return -1;
14242
14243 /* Write the sign if needed */
14244 if (arg->sign) {
14245 if (fill != ' ') {
14246 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14247 writer->pos += 1;
14248 }
14249 if (arg->width > len)
14250 arg->width--;
14251 }
14252
14253 /* Write the numeric prefix for "x", "X" and "o" formats
14254 if the alternate form is used.
14255 For example, write "0x" for the "%#x" format. */
14256 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14257 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14258 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14259 if (fill != ' ') {
14260 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14261 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14262 writer->pos += 2;
14263 pindex += 2;
14264 }
14265 arg->width -= 2;
14266 if (arg->width < 0)
14267 arg->width = 0;
14268 len -= 2;
14269 }
14270
14271 /* Pad left with the fill character if needed */
14272 if (arg->width > len && !(arg->flags & F_LJUST)) {
14273 sublen = arg->width - len;
14274 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14275 writer->pos += sublen;
14276 arg->width = len;
14277 }
14278
14279 /* If padding with spaces: write sign if needed and/or numeric prefix if
14280 the alternate form is used */
14281 if (fill == ' ') {
14282 if (arg->sign) {
14283 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14284 writer->pos += 1;
14285 }
14286 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14287 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14288 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14289 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14290 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14291 writer->pos += 2;
14292 pindex += 2;
14293 }
14294 }
14295
14296 /* Write characters */
14297 if (len) {
14298 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14299 str, pindex, len);
14300 writer->pos += len;
14301 }
14302
14303 /* Pad right with the fill character if needed */
14304 if (arg->width > len) {
14305 sublen = arg->width - len;
14306 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14307 writer->pos += sublen;
14308 }
14309 return 0;
14310}
14311
14312/* Helper of PyUnicode_Format(): format one arg.
14313 Return 0 on success, raise an exception and return -1 on error. */
14314static int
14315unicode_format_arg(struct unicode_formatter_t *ctx)
14316{
14317 struct unicode_format_arg_t arg;
14318 PyObject *str;
14319 int ret;
14320
Victor Stinner8dbd4212012-12-04 09:30:24 +010014321 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14322 arg.flags = 0;
14323 arg.width = -1;
14324 arg.prec = -1;
14325 arg.sign = 0;
14326 str = NULL;
14327
Victor Stinnera47082312012-10-04 02:19:54 +020014328 ret = unicode_format_arg_parse(ctx, &arg);
14329 if (ret == -1)
14330 return -1;
14331
14332 ret = unicode_format_arg_format(ctx, &arg, &str);
14333 if (ret == -1)
14334 return -1;
14335
14336 if (ret != 1) {
14337 ret = unicode_format_arg_output(ctx, &arg, str);
14338 Py_DECREF(str);
14339 if (ret == -1)
14340 return -1;
14341 }
14342
14343 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14344 PyErr_SetString(PyExc_TypeError,
14345 "not all arguments converted during string formatting");
14346 return -1;
14347 }
14348 return 0;
14349}
14350
Alexander Belopolsky40018472011-02-26 01:02:56 +000014351PyObject *
14352PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014353{
Victor Stinnera47082312012-10-04 02:19:54 +020014354 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014355
Guido van Rossumd57fd912000-03-10 22:53:23 +000014356 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014357 PyErr_BadInternalCall();
14358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014359 }
Victor Stinnera47082312012-10-04 02:19:54 +020014360
14361 ctx.fmtstr = PyUnicode_FromObject(format);
14362 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014363 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014364 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14365 Py_DECREF(ctx.fmtstr);
14366 return NULL;
14367 }
14368 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14369 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14370 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14371 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014372
Victor Stinner8f674cc2013-04-17 23:02:17 +020014373 _PyUnicodeWriter_Init(&ctx.writer);
14374 ctx.writer.min_length = ctx.fmtcnt + 100;
14375 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014376
Guido van Rossumd57fd912000-03-10 22:53:23 +000014377 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014378 ctx.arglen = PyTuple_Size(args);
14379 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014380 }
14381 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014382 ctx.arglen = -1;
14383 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014384 }
Victor Stinnera47082312012-10-04 02:19:54 +020014385 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014386 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014387 ctx.dict = args;
14388 else
14389 ctx.dict = NULL;
14390 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014391
Victor Stinnera47082312012-10-04 02:19:54 +020014392 while (--ctx.fmtcnt >= 0) {
14393 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014394 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014395
14396 nonfmtpos = ctx.fmtpos++;
14397 while (ctx.fmtcnt >= 0 &&
14398 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14399 ctx.fmtpos++;
14400 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014401 }
Victor Stinnera47082312012-10-04 02:19:54 +020014402 if (ctx.fmtcnt < 0) {
14403 ctx.fmtpos--;
14404 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014405 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014406
Victor Stinnercfc4c132013-04-03 01:48:39 +020014407 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14408 nonfmtpos, ctx.fmtpos) < 0)
14409 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014410 }
14411 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014412 ctx.fmtpos++;
14413 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014414 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014415 }
14416 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014417
Victor Stinnera47082312012-10-04 02:19:54 +020014418 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014419 PyErr_SetString(PyExc_TypeError,
14420 "not all arguments converted during string formatting");
14421 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014422 }
14423
Victor Stinnera47082312012-10-04 02:19:54 +020014424 if (ctx.args_owned) {
14425 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014426 }
Victor Stinnera47082312012-10-04 02:19:54 +020014427 Py_DECREF(ctx.fmtstr);
14428 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014429
Benjamin Peterson29060642009-01-31 22:14:21 +000014430 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014431 Py_DECREF(ctx.fmtstr);
14432 _PyUnicodeWriter_Dealloc(&ctx.writer);
14433 if (ctx.args_owned) {
14434 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014435 }
14436 return NULL;
14437}
14438
Jeremy Hylton938ace62002-07-17 16:30:39 +000014439static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014440unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14441
Tim Peters6d6c1a32001-08-02 04:15:00 +000014442static PyObject *
14443unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14444{
Benjamin Peterson29060642009-01-31 22:14:21 +000014445 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014446 static char *kwlist[] = {"object", "encoding", "errors", 0};
14447 char *encoding = NULL;
14448 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014449
Benjamin Peterson14339b62009-01-31 16:36:08 +000014450 if (type != &PyUnicode_Type)
14451 return unicode_subtype_new(type, args, kwds);
14452 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014453 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014454 return NULL;
14455 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014456 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014457 if (encoding == NULL && errors == NULL)
14458 return PyObject_Str(x);
14459 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014460 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014461}
14462
Guido van Rossume023fe02001-08-30 03:12:59 +000014463static PyObject *
14464unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14465{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014466 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014467 Py_ssize_t length, char_size;
14468 int share_wstr, share_utf8;
14469 unsigned int kind;
14470 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014471
Benjamin Peterson14339b62009-01-31 16:36:08 +000014472 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014473
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014474 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014475 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014476 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014477 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014478 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014479 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014480 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014481 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014482
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014483 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014484 if (self == NULL) {
14485 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014486 return NULL;
14487 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014488 kind = PyUnicode_KIND(unicode);
14489 length = PyUnicode_GET_LENGTH(unicode);
14490
14491 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014492#ifdef Py_DEBUG
14493 _PyUnicode_HASH(self) = -1;
14494#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014495 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014496#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014497 _PyUnicode_STATE(self).interned = 0;
14498 _PyUnicode_STATE(self).kind = kind;
14499 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014500 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014501 _PyUnicode_STATE(self).ready = 1;
14502 _PyUnicode_WSTR(self) = NULL;
14503 _PyUnicode_UTF8_LENGTH(self) = 0;
14504 _PyUnicode_UTF8(self) = NULL;
14505 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014506 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014507
14508 share_utf8 = 0;
14509 share_wstr = 0;
14510 if (kind == PyUnicode_1BYTE_KIND) {
14511 char_size = 1;
14512 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14513 share_utf8 = 1;
14514 }
14515 else if (kind == PyUnicode_2BYTE_KIND) {
14516 char_size = 2;
14517 if (sizeof(wchar_t) == 2)
14518 share_wstr = 1;
14519 }
14520 else {
14521 assert(kind == PyUnicode_4BYTE_KIND);
14522 char_size = 4;
14523 if (sizeof(wchar_t) == 4)
14524 share_wstr = 1;
14525 }
14526
14527 /* Ensure we won't overflow the length. */
14528 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14529 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014530 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014531 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014532 data = PyObject_MALLOC((length + 1) * char_size);
14533 if (data == NULL) {
14534 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014535 goto onError;
14536 }
14537
Victor Stinnerc3c74152011-10-02 20:39:55 +020014538 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014539 if (share_utf8) {
14540 _PyUnicode_UTF8_LENGTH(self) = length;
14541 _PyUnicode_UTF8(self) = data;
14542 }
14543 if (share_wstr) {
14544 _PyUnicode_WSTR_LENGTH(self) = length;
14545 _PyUnicode_WSTR(self) = (wchar_t *)data;
14546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014547
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014548 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014549 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014550 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014551#ifdef Py_DEBUG
14552 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14553#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014554 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014555 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014556
14557onError:
14558 Py_DECREF(unicode);
14559 Py_DECREF(self);
14560 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014561}
14562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014563PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014564"str(object='') -> str\n\
14565str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014566\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014567Create a new string object from the given object. If encoding or\n\
14568errors is specified, then the object must expose a data buffer\n\
14569that will be decoded using the given encoding and error handler.\n\
14570Otherwise, returns the result of object.__str__() (if defined)\n\
14571or repr(object).\n\
14572encoding defaults to sys.getdefaultencoding().\n\
14573errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014574
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014575static PyObject *unicode_iter(PyObject *seq);
14576
Guido van Rossumd57fd912000-03-10 22:53:23 +000014577PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014578 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014579 "str", /* tp_name */
14580 sizeof(PyUnicodeObject), /* tp_size */
14581 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014582 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014583 (destructor)unicode_dealloc, /* tp_dealloc */
14584 0, /* tp_print */
14585 0, /* tp_getattr */
14586 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014587 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014588 unicode_repr, /* tp_repr */
14589 &unicode_as_number, /* tp_as_number */
14590 &unicode_as_sequence, /* tp_as_sequence */
14591 &unicode_as_mapping, /* tp_as_mapping */
14592 (hashfunc) unicode_hash, /* tp_hash*/
14593 0, /* tp_call*/
14594 (reprfunc) unicode_str, /* tp_str */
14595 PyObject_GenericGetAttr, /* tp_getattro */
14596 0, /* tp_setattro */
14597 0, /* tp_as_buffer */
14598 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014599 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014600 unicode_doc, /* tp_doc */
14601 0, /* tp_traverse */
14602 0, /* tp_clear */
14603 PyUnicode_RichCompare, /* tp_richcompare */
14604 0, /* tp_weaklistoffset */
14605 unicode_iter, /* tp_iter */
14606 0, /* tp_iternext */
14607 unicode_methods, /* tp_methods */
14608 0, /* tp_members */
14609 0, /* tp_getset */
14610 &PyBaseObject_Type, /* tp_base */
14611 0, /* tp_dict */
14612 0, /* tp_descr_get */
14613 0, /* tp_descr_set */
14614 0, /* tp_dictoffset */
14615 0, /* tp_init */
14616 0, /* tp_alloc */
14617 unicode_new, /* tp_new */
14618 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014619};
14620
14621/* Initialize the Unicode implementation */
14622
Victor Stinner3a50e702011-10-18 21:21:00 +020014623int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014624{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014625 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014626 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014627 0x000A, /* LINE FEED */
14628 0x000D, /* CARRIAGE RETURN */
14629 0x001C, /* FILE SEPARATOR */
14630 0x001D, /* GROUP SEPARATOR */
14631 0x001E, /* RECORD SEPARATOR */
14632 0x0085, /* NEXT LINE */
14633 0x2028, /* LINE SEPARATOR */
14634 0x2029, /* PARAGRAPH SEPARATOR */
14635 };
14636
Fred Drakee4315f52000-05-09 19:53:39 +000014637 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014638 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014639 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014640 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014641 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014642
Guido van Rossumcacfc072002-05-24 19:01:59 +000014643 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014644 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014645
14646 /* initialize the linebreak bloom filter */
14647 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014648 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014649 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014650
Christian Heimes26532f72013-07-20 14:57:16 +020014651 if (PyType_Ready(&EncodingMapType) < 0)
14652 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014653
Benjamin Petersonc4311282012-10-30 23:21:10 -040014654 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14655 Py_FatalError("Can't initialize field name iterator type");
14656
14657 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14658 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014659
Victor Stinner3a50e702011-10-18 21:21:00 +020014660#ifdef HAVE_MBCS
14661 winver.dwOSVersionInfoSize = sizeof(winver);
14662 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14663 PyErr_SetFromWindowsErr(0);
14664 return -1;
14665 }
14666#endif
14667 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014668}
14669
14670/* Finalize the Unicode implementation */
14671
/* Free-list clearing hook.  This implementation releases nothing and
   always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14677
Guido van Rossumd57fd912000-03-10 22:53:23 +000014678void
Thomas Wouters78890102000-07-22 19:25:51 +000014679_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014680{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014681 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014682
Serhiy Storchaka05997252013-01-26 12:14:02 +020014683 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014684
Serhiy Storchaka05997252013-01-26 12:14:02 +020014685 for (i = 0; i < 256; i++)
14686 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014687 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014688 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014689}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014690
Walter Dörwald16807132007-05-25 13:52:07 +000014691void
14692PyUnicode_InternInPlace(PyObject **p)
14693{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014694 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014695 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014696#ifdef Py_DEBUG
14697 assert(s != NULL);
14698 assert(_PyUnicode_CHECK(s));
14699#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014700 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014701 return;
14702#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014703 /* If it's a subclass, we don't really know what putting
14704 it in the interned dict might do. */
14705 if (!PyUnicode_CheckExact(s))
14706 return;
14707 if (PyUnicode_CHECK_INTERNED(s))
14708 return;
14709 if (interned == NULL) {
14710 interned = PyDict_New();
14711 if (interned == NULL) {
14712 PyErr_Clear(); /* Don't leave an exception */
14713 return;
14714 }
14715 }
14716 /* It might be that the GetItem call fails even
14717 though the key is present in the dictionary,
14718 namely when this happens during a stack overflow. */
14719 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014720 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014721 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014722
Victor Stinnerf0335102013-04-14 19:13:03 +020014723 if (t) {
14724 Py_INCREF(t);
14725 Py_DECREF(*p);
14726 *p = t;
14727 return;
14728 }
Walter Dörwald16807132007-05-25 13:52:07 +000014729
Benjamin Peterson14339b62009-01-31 16:36:08 +000014730 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014731 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014732 PyErr_Clear();
14733 PyThreadState_GET()->recursion_critical = 0;
14734 return;
14735 }
14736 PyThreadState_GET()->recursion_critical = 0;
14737 /* The two references in interned are not counted by refcnt.
14738 The deallocator will take care of this */
14739 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014740 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014741}
14742
14743void
14744PyUnicode_InternImmortal(PyObject **p)
14745{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014746 PyUnicode_InternInPlace(p);
14747 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014748 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014749 Py_INCREF(*p);
14750 }
Walter Dörwald16807132007-05-25 13:52:07 +000014751}
14752
14753PyObject *
14754PyUnicode_InternFromString(const char *cp)
14755{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014756 PyObject *s = PyUnicode_FromString(cp);
14757 if (s == NULL)
14758 return NULL;
14759 PyUnicode_InternInPlace(&s);
14760 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014761}
14762
Alexander Belopolsky40018472011-02-26 01:02:56 +000014763void
14764_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014765{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014766 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014767 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014768 Py_ssize_t i, n;
14769 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014770
Benjamin Peterson14339b62009-01-31 16:36:08 +000014771 if (interned == NULL || !PyDict_Check(interned))
14772 return;
14773 keys = PyDict_Keys(interned);
14774 if (keys == NULL || !PyList_Check(keys)) {
14775 PyErr_Clear();
14776 return;
14777 }
Walter Dörwald16807132007-05-25 13:52:07 +000014778
Benjamin Peterson14339b62009-01-31 16:36:08 +000014779 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14780 detector, interned unicode strings are not forcibly deallocated;
14781 rather, we give them their stolen references back, and then clear
14782 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014783
Benjamin Peterson14339b62009-01-31 16:36:08 +000014784 n = PyList_GET_SIZE(keys);
14785 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014786 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014787 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014788 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014789 if (PyUnicode_READY(s) == -1) {
14790 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014791 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014793 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014794 case SSTATE_NOT_INTERNED:
14795 /* XXX Shouldn't happen */
14796 break;
14797 case SSTATE_INTERNED_IMMORTAL:
14798 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014799 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014800 break;
14801 case SSTATE_INTERNED_MORTAL:
14802 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014803 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014804 break;
14805 default:
14806 Py_FatalError("Inconsistent interned string state.");
14807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014808 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014809 }
14810 fprintf(stderr, "total size of all interned strings: "
14811 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14812 "mortal/immortal\n", mortal_size, immortal_size);
14813 Py_DECREF(keys);
14814 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014815 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014816}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014817
14818
14819/********************* Unicode Iterator **************************/
14820
14821typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014822 PyObject_HEAD
14823 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014824 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014825} unicodeiterobject;
14826
14827static void
14828unicodeiter_dealloc(unicodeiterobject *it)
14829{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014830 _PyObject_GC_UNTRACK(it);
14831 Py_XDECREF(it->it_seq);
14832 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014833}
14834
14835static int
14836unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14837{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014838 Py_VISIT(it->it_seq);
14839 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014840}
14841
14842static PyObject *
14843unicodeiter_next(unicodeiterobject *it)
14844{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014845 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014846
Benjamin Peterson14339b62009-01-31 16:36:08 +000014847 assert(it != NULL);
14848 seq = it->it_seq;
14849 if (seq == NULL)
14850 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014851 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014853 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14854 int kind = PyUnicode_KIND(seq);
14855 void *data = PyUnicode_DATA(seq);
14856 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14857 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014858 if (item != NULL)
14859 ++it->it_index;
14860 return item;
14861 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014862
Benjamin Peterson14339b62009-01-31 16:36:08 +000014863 Py_DECREF(seq);
14864 it->it_seq = NULL;
14865 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014866}
14867
14868static PyObject *
14869unicodeiter_len(unicodeiterobject *it)
14870{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014871 Py_ssize_t len = 0;
14872 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014873 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014874 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014875}
14876
14877PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14878
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014879static PyObject *
14880unicodeiter_reduce(unicodeiterobject *it)
14881{
14882 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014883 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014884 it->it_seq, it->it_index);
14885 } else {
14886 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14887 if (u == NULL)
14888 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014889 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014890 }
14891}
14892
14893PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14894
14895static PyObject *
14896unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14897{
14898 Py_ssize_t index = PyLong_AsSsize_t(state);
14899 if (index == -1 && PyErr_Occurred())
14900 return NULL;
14901 if (index < 0)
14902 index = 0;
14903 it->it_index = index;
14904 Py_RETURN_NONE;
14905}
14906
14907PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14908
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014909static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014910 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014911 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014912 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14913 reduce_doc},
14914 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14915 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014916 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014917};
14918
14919PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014920 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14921 "str_iterator", /* tp_name */
14922 sizeof(unicodeiterobject), /* tp_basicsize */
14923 0, /* tp_itemsize */
14924 /* methods */
14925 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14926 0, /* tp_print */
14927 0, /* tp_getattr */
14928 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014929 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014930 0, /* tp_repr */
14931 0, /* tp_as_number */
14932 0, /* tp_as_sequence */
14933 0, /* tp_as_mapping */
14934 0, /* tp_hash */
14935 0, /* tp_call */
14936 0, /* tp_str */
14937 PyObject_GenericGetAttr, /* tp_getattro */
14938 0, /* tp_setattro */
14939 0, /* tp_as_buffer */
14940 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14941 0, /* tp_doc */
14942 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14943 0, /* tp_clear */
14944 0, /* tp_richcompare */
14945 0, /* tp_weaklistoffset */
14946 PyObject_SelfIter, /* tp_iter */
14947 (iternextfunc)unicodeiter_next, /* tp_iternext */
14948 unicodeiter_methods, /* tp_methods */
14949 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014950};
14951
14952static PyObject *
14953unicode_iter(PyObject *seq)
14954{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014955 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014956
Benjamin Peterson14339b62009-01-31 16:36:08 +000014957 if (!PyUnicode_Check(seq)) {
14958 PyErr_BadInternalCall();
14959 return NULL;
14960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014961 if (PyUnicode_READY(seq) == -1)
14962 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014963 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14964 if (it == NULL)
14965 return NULL;
14966 it->it_index = 0;
14967 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014968 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014969 _PyObject_GC_TRACK(it);
14970 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014971}
14972
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014973
14974size_t
14975Py_UNICODE_strlen(const Py_UNICODE *u)
14976{
14977 int res = 0;
14978 while(*u++)
14979 res++;
14980 return res;
14981}
14982
14983Py_UNICODE*
14984Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14985{
14986 Py_UNICODE *u = s1;
14987 while ((*u++ = *s2++));
14988 return s1;
14989}
14990
14991Py_UNICODE*
14992Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14993{
14994 Py_UNICODE *u = s1;
14995 while ((*u++ = *s2++))
14996 if (n-- == 0)
14997 break;
14998 return s1;
14999}
15000
15001Py_UNICODE*
15002Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15003{
15004 Py_UNICODE *u1 = s1;
15005 u1 += Py_UNICODE_strlen(u1);
15006 Py_UNICODE_strcpy(u1, s2);
15007 return s1;
15008}
15009
15010int
15011Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15012{
15013 while (*s1 && *s2 && *s1 == *s2)
15014 s1++, s2++;
15015 if (*s1 && *s2)
15016 return (*s1 < *s2) ? -1 : +1;
15017 if (*s1)
15018 return 1;
15019 if (*s2)
15020 return -1;
15021 return 0;
15022}
15023
15024int
15025Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15026{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015027 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015028 for (; n != 0; n--) {
15029 u1 = *s1;
15030 u2 = *s2;
15031 if (u1 != u2)
15032 return (u1 < u2) ? -1 : +1;
15033 if (u1 == '\0')
15034 return 0;
15035 s1++;
15036 s2++;
15037 }
15038 return 0;
15039}
15040
15041Py_UNICODE*
15042Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15043{
15044 const Py_UNICODE *p;
15045 for (p = s; *p; p++)
15046 if (*p == c)
15047 return (Py_UNICODE*)p;
15048 return NULL;
15049}
15050
15051Py_UNICODE*
15052Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15053{
15054 const Py_UNICODE *p;
15055 p = s + Py_UNICODE_strlen(s);
15056 while (p != s) {
15057 p--;
15058 if (*p == c)
15059 return (Py_UNICODE*)p;
15060 }
15061 return NULL;
15062}
Victor Stinner331ea922010-08-10 16:37:20 +000015063
Victor Stinner71133ff2010-09-01 23:43:53 +000015064Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015065PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015066{
Victor Stinner577db2c2011-10-11 22:12:48 +020015067 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015068 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015070 if (!PyUnicode_Check(unicode)) {
15071 PyErr_BadArgument();
15072 return NULL;
15073 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015074 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015075 if (u == NULL)
15076 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015077 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015078 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015079 PyErr_NoMemory();
15080 return NULL;
15081 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015082 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015083 size *= sizeof(Py_UNICODE);
15084 copy = PyMem_Malloc(size);
15085 if (copy == NULL) {
15086 PyErr_NoMemory();
15087 return NULL;
15088 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015089 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015090 return copy;
15091}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015092
Georg Brandl66c221e2010-10-14 07:04:07 +000015093/* A _string module, to export formatter_parser and formatter_field_name_split
15094 to the string.Formatter class implemented in Python. */
15095
15096static PyMethodDef _string_methods[] = {
15097 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15098 METH_O, PyDoc_STR("split the argument as a field name")},
15099 {"formatter_parser", (PyCFunction) formatter_parser,
15100 METH_O, PyDoc_STR("parse the argument as a format string")},
15101 {NULL, NULL}
15102};
15103
15104static struct PyModuleDef _string_module = {
15105 PyModuleDef_HEAD_INIT,
15106 "_string",
15107 PyDoc_STR("string helper module"),
15108 0,
15109 _string_methods,
15110 NULL,
15111 NULL,
15112 NULL,
15113 NULL
15114};
15115
15116PyMODINIT_FUNC
15117PyInit__string(void)
15118{
15119 return PyModule_Create(&_string_module);
15120}
15121
15122
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015123#ifdef __cplusplus
15124}
15125#endif