blob: 6dc583517c2acf4f75a26839c5dd94118719ef98 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
Serhiy Storchaka05997252013-01-26 12:14:02 +020052NOTE: In the interpreter's initialization phase, some globals are currently
53 initialized dynamically as needed. In the process Unicode objects may
54 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055
56*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the assert()s of the accessor macros in this
   file.  Debug builds run _PyUnicode_CheckConsistency() with check_content
   set to 0; other builds only perform the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* Field accessors.  The variants whose name starts with an underscore cast
   the object to the relevant struct and name the member directly (no
   checks).  The variants that carry assert()s verify the object with
   _PyUnicode_CHECK (and, where stated, readiness) before reading. */

/* utf8 member of a PyCompactUnicodeObject, unchecked. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 data of a ready string: for compact ASCII objects this is the
   memory located right after the PyASCIIObject header, otherwise the
   utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of a PyCompactUnicodeObject, unchecked. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for compact ASCII objects this is the
   length member of the PyASCIIObject header, otherwise utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr member (PyASCIIObject), unchecked. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* wstr_length member (PyCompactUnicodeObject), unchecked. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length member (PyASCIIObject), unchecked. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* state member (PyASCIIObject), unchecked. */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* hash member (PyASCIIObject), unchecked. */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length member, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of a PyUnicodeObject, unchecked. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* Replace any previous definition of PyUnicode_READY for this file: assert
   that the object passes _PyUnicode_CHECK, then evaluate to 0 when the
   string is already ready and to the result of _PyUnicode_Ready(op)
   otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200113
/* True if the utf8 pointer of the object is the same address as its
   character data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the object is the same address as its
   character data.  The macro only refers to its own parameter, so it can
   be applied to any expression, not just to a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
121
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready yet or wstr differs from the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
135
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain C cast to to_type.  The
   bulk of the range is copied four elements per iteration; the remaining
   (at most three) elements are copied one by one.

   Every argument is evaluated exactly once.  The "to" argument is
   parenthesized before being cast, so a compound expression such as
   "buffer + offset" is converted as a whole instead of only its first
   operand. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200159
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.  The pointer
   starts out as NULL. */
static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200172
/* Take a new reference to the shared empty string.  If unicode_empty is
   still NULL it is created with PyUnicode_New(0, 0) first; on success an
   additional Py_INCREF is applied, so the global pointer keeps the
   reference returned by PyUnicode_New() and the user of the macro owns
   the extra one.  If the creation fails, unicode_empty is left NULL and no
   reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return (from the enclosing function) a new reference to the shared empty
   string.  The returned pointer is unicode_empty itself, which is NULL if
   _Py_INCREF_UNICODE_EMPTY() could not create the object. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191
/* Forward declaration: writes a single Py_UCS4 character through a
   _PyUnicodeWriter and returns an int status. */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings (pointer to a _Py_Identifier; starts empty). */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); a NULL
   entry means no object has been cached for that character yet. */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202
/* Fast detection of the most frequent whitespace characters.

   Lookup table covering the 128 ASCII code points, one row per 16 code
   points.  An entry is 1 for:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE
   and 0 for every other code point. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
233
/* Forward declarations of static helpers that are used before their
   definition in this translation unit. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of "size" characters of the given width. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helper; takes the error handler name and cache,
   the encoding and reason strings, the string being encoded, the
   exception object cache and the start/end position of the error, and
   receives the resume position through newpos. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

/* Encoder helper that raises an exception for the range
   startpos..endpos of "unicode". */
static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000259
/* Fast detection of line breaks among the 128 ASCII code points (same idea
   as the ASCII whitespace table).  An entry is 1 for the code points listed
   in the comments below and 0 otherwise.  The table is read-only, hence
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
287
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300288/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
289 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000291PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000293#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000294 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000296 /* This is actually an illegal character, so it should
297 not be passed to unichr. */
298 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299#endif
300}
301
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a Unicode object.

   op:            object to inspect; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not in the wchar_t-only
                  state), every character is also read to verify that the
                  narrowest possible kind is used and that the buffer is
                  terminated by a null character.

   Always returns 1, which allows the call to be wrapped in assert();
   every violated invariant triggers an assert() inside the function. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Case 1: compact ASCII -- one-byte kind and ready. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Case 2: compact, non-ASCII -- the characters are stored right
           after the PyCompactUnicodeObject header. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Case 3: non-compact -- the characters are reached through
               the data.any pointer. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready: only the wstr buffer exists. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready, non-compact string. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: the UTF-8 view is the data itself. */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t (selected at compile time). */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cache must have a zero recorded length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest character actually stored. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest character must fall in the range that requires
           exactly this kind (and this ascii flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The element following the last character must be zero. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
417
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100418static PyObject*
419unicode_result_wchar(PyObject *unicode)
420{
421#ifndef Py_DEBUG
422 Py_ssize_t len;
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100426 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200427 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100432 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200440 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 return NULL;
442 }
443#else
Victor Stinneraa771272012-10-04 02:32:58 +0200444 assert(Py_REFCNT(unicode) == 1);
445
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 /* don't make the result ready in debug mode to ensure that the caller
447 makes the string ready before using it */
448 assert(_PyUnicode_CheckConsistency(unicode, 1));
449#endif
450 return unicode;
451}
452
453static PyObject*
454unicode_result_ready(PyObject *unicode)
455{
456 Py_ssize_t length;
457
458 length = PyUnicode_GET_LENGTH(unicode);
459 if (length == 0) {
460 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100461 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200462 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100463 }
464 return unicode_empty;
465 }
466
467 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200468 void *data = PyUnicode_DATA(unicode);
469 int kind = PyUnicode_KIND(unicode);
470 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100471 if (ch < 256) {
472 PyObject *latin1_char = unicode_latin1[ch];
473 if (latin1_char != NULL) {
474 if (unicode != latin1_char) {
475 Py_INCREF(latin1_char);
476 Py_DECREF(unicode);
477 }
478 return latin1_char;
479 }
480 else {
481 assert(_PyUnicode_CheckConsistency(unicode, 1));
482 Py_INCREF(unicode);
483 unicode_latin1[ch] = unicode;
484 return unicode;
485 }
486 }
487 }
488
489 assert(_PyUnicode_CheckConsistency(unicode, 1));
490 return unicode;
491}
492
493static PyObject*
494unicode_result(PyObject *unicode)
495{
496 assert(_PyUnicode_CHECK(unicode));
497 if (PyUnicode_IS_READY(unicode))
498 return unicode_result_ready(unicode);
499 else
500 return unicode_result_wchar(unicode);
501}
502
Victor Stinnerc4b49542011-12-11 22:44:26 +0100503static PyObject*
504unicode_result_unchanged(PyObject *unicode)
505{
506 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500507 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508 return NULL;
509 Py_INCREF(unicode);
510 return unicode;
511 }
512 else
513 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100514 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515}
516
#ifdef HAVE_MBCS
/* Windows version information, only compiled in when HAVE_MBCS is defined.
   NOTE(review): presumably filled in during Unicode initialization and
   consulted by the MBCS codec -- confirm against the rest of the file. */
static OSVERSIONINFOEX winver;
#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521/* --- Bloom Filters ----------------------------------------------------- */
522
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low-order bits of
   each Unicode character (as many as needed to index BLOOM_WIDTH bits,
   i.e. 5 bits for a 32-bit mask) select the bit that represents it. */
526
527/* the linebreak mask is set up by Unicode_Init below */
528
/* Number of bits used in a bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT.  Compilation fails if LONG_BIT is below 32. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
538
/* Integer type holding one bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters.  It starts with every bit set,
   so until a real mask is stored the filter never rejects a character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low-order bits of ch is set in mask,
   i.e. ch *may* belong to the set described by mask; zero means it
   definitely does not.  Both arguments are parenthesized, so any
   expression can be passed. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: ASCII characters are looked up in ascii_linebreak[];
   other characters are first screened with the bloom mask and only then
   checked with Py_UNICODE_ISLINEBREAK().  Note that ch is evaluated more
   than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
Alexander Belopolsky40018472011-02-26 01:02:56 +0000549Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551{
Victor Stinnera85af502013-04-09 21:53:54 +0200552#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
553 do { \
554 TYPE *data = (TYPE *)PTR; \
555 TYPE *end = data + LEN; \
556 Py_UCS4 ch; \
557 for (; data != end; data++) { \
558 ch = *data; \
559 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
560 } \
561 break; \
562 } while (0)
563
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564 /* calculate simple bloom-style bitmask for a given unicode string */
565
Antoine Pitrouf068f942010-01-13 14:19:12 +0000566 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567
568 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200569 switch (kind) {
570 case PyUnicode_1BYTE_KIND:
571 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
572 break;
573 case PyUnicode_2BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
575 break;
576 case PyUnicode_4BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
578 break;
579 default:
580 assert(0);
581 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000582 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200583
584#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585}
586
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200587/* Compilation of templated routines */
588
589#include "stringlib/asciilib.h"
590#include "stringlib/fastsearch.h"
591#include "stringlib/partition.h"
592#include "stringlib/split.h"
593#include "stringlib/count.h"
594#include "stringlib/find.h"
595#include "stringlib/find_max_char.h"
596#include "stringlib/localeutil.h"
597#include "stringlib/undef.h"
598
599#include "stringlib/ucs1lib.h"
600#include "stringlib/fastsearch.h"
601#include "stringlib/partition.h"
602#include "stringlib/split.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300605#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs2lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300616#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200617#include "stringlib/find_max_char.h"
618#include "stringlib/localeutil.h"
619#include "stringlib/undef.h"
620
621#include "stringlib/ucs4lib.h"
622#include "stringlib/fastsearch.h"
623#include "stringlib/partition.h"
624#include "stringlib/split.h"
625#include "stringlib/count.h"
626#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300627#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200628#include "stringlib/find_max_char.h"
629#include "stringlib/localeutil.h"
630#include "stringlib/undef.h"
631
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200632#include "stringlib/unicodedefs.h"
633#include "stringlib/fastsearch.h"
634#include "stringlib/count.h"
635#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100636#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638/* --- Unicode Object ----------------------------------------------------- */
639
/* Forward declaration.  fixfct is a callback that receives a string object
   and returns a Py_UCS4 value. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200643Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
644 Py_ssize_t size, Py_UCS4 ch,
645 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
648
649 switch (kind) {
650 case PyUnicode_1BYTE_KIND:
651 {
652 Py_UCS1 ch1 = (Py_UCS1) ch;
653 if (ch1 == ch)
654 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
655 else
656 return -1;
657 }
658 case PyUnicode_2BYTE_KIND:
659 {
660 Py_UCS2 ch2 = (Py_UCS2) ch;
661 if (ch2 == ch)
662 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
663 else
664 return -1;
665 }
666 case PyUnicode_4BYTE_KIND:
667 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
668 default:
669 assert(0);
670 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672}
673
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Every byte of the characters located between old_length and the current
   length of the string is set to 0xff; nothing is done if the string did
   not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        /* the kind value doubles as the size in bytes of one character */
        Py_UCS1 *tail = data + old_length * kind;
        memset(tail, 0xff, (length - old_length) * kind);
    }
}
#endif
692
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693static PyObject*
694resize_compact(PyObject *unicode, Py_ssize_t length)
695{
696 Py_ssize_t char_size;
697 Py_ssize_t struct_size;
698 Py_ssize_t new_size;
699 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100700 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
703#endif
704
Victor Stinner79891572012-05-03 13:43:07 +0200705 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100707 assert(PyUnicode_IS_COMPACT(unicode));
708
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200709 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100710 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 struct_size = sizeof(PyASCIIObject);
712 else
713 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
717 PyErr_NoMemory();
718 return NULL;
719 }
720 new_size = (struct_size + (length + 1) * char_size);
721
Victor Stinner84def372011-12-11 20:04:56 +0100722 _Py_DEC_REFTOTAL;
723 _Py_ForgetReference(unicode);
724
725 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
726 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100727 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200728 PyErr_NoMemory();
729 return NULL;
730 }
Victor Stinner84def372011-12-11 20:04:56 +0100731 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100733
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100737 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 _PyUnicode_WSTR_LENGTH(unicode) = length;
739 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100740 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
741 PyObject_DEL(_PyUnicode_WSTR(unicode));
742 _PyUnicode_WSTR(unicode) = NULL;
743 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200744#ifdef Py_DEBUG
745 unicode_fill_invalid(unicode, old_length);
746#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
748 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 return unicode;
751}
752
Alexander Belopolsky40018472011-02-26 01:02:56 +0000753static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200754resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755{
Victor Stinner95663112011-10-04 01:03:50 +0200756 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000760
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 if (PyUnicode_IS_READY(unicode)) {
762 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200765#ifdef Py_DEBUG
766 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
767#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200770 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200771 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
772 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
775 PyErr_NoMemory();
776 return -1;
777 }
778 new_size = (length + 1) * char_size;
779
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
781 {
782 PyObject_DEL(_PyUnicode_UTF8(unicode));
783 _PyUnicode_UTF8(unicode) = NULL;
784 _PyUnicode_UTF8_LENGTH(unicode) = 0;
785 }
786
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 data = (PyObject *)PyObject_REALLOC(data, new_size);
788 if (data == NULL) {
789 PyErr_NoMemory();
790 return -1;
791 }
792 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200793 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200795 _PyUnicode_WSTR_LENGTH(unicode) = length;
796 }
797 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200798 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200799 _PyUnicode_UTF8_LENGTH(unicode) = length;
800 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200801 _PyUnicode_LENGTH(unicode) = length;
802 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200803#ifdef Py_DEBUG
804 unicode_fill_invalid(unicode, old_length);
805#endif
Victor Stinner95663112011-10-04 01:03:50 +0200806 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200807 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200809 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200810 }
Victor Stinner95663112011-10-04 01:03:50 +0200811 assert(_PyUnicode_WSTR(unicode) != NULL);
812
813 /* check for integer overflow */
814 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
815 PyErr_NoMemory();
816 return -1;
817 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100818 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200819 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100820 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200821 if (!wstr) {
822 PyErr_NoMemory();
823 return -1;
824 }
825 _PyUnicode_WSTR(unicode) = wstr;
826 _PyUnicode_WSTR(unicode)[length] = 0;
827 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200828 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 return 0;
830}
831
Victor Stinnerfe226c02011-10-03 03:52:20 +0200832static PyObject*
833resize_copy(PyObject *unicode, Py_ssize_t length)
834{
835 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100836 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100838
Benjamin Petersonbac79492012-01-14 13:34:47 -0500839 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100840 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200841
842 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
843 if (copy == NULL)
844 return NULL;
845
846 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200847 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200848 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200849 }
850 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200851 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100852
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200853 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200854 if (w == NULL)
855 return NULL;
856 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
857 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200858 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
859 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200860 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200861 }
862}
863
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000865 Ux0000 terminated; some code (e.g. new_identifier)
866 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867
868 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000869 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870
871*/
872
Alexander Belopolsky40018472011-02-26 01:02:56 +0000873static PyUnicodeObject *
874_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200876 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880 if (length == 0 && unicode_empty != NULL) {
881 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200882 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 }
884
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000885 /* Ensure we won't overflow the size. */
886 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
887 return (PyUnicodeObject *)PyErr_NoMemory();
888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 if (length < 0) {
890 PyErr_SetString(PyExc_SystemError,
891 "Negative size passed to _PyUnicode_New");
892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
896 if (unicode == NULL)
897 return NULL;
898 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
899 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
900 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100901 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000902 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100903 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905
Jeremy Hyltond8082792003-09-16 19:41:39 +0000906 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000907 * the caller fails before initializing str -- unicode_resize()
908 * reads str[0], and the Keep-Alive optimization can keep memory
909 * allocated for str alive across a call to unicode_dealloc(unicode).
910 * We don't want unicode_resize to read uninitialized memory in
911 * that case.
912 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913 _PyUnicode_WSTR(unicode)[0] = 0;
914 _PyUnicode_WSTR(unicode)[length] = 0;
915 _PyUnicode_WSTR_LENGTH(unicode) = length;
916 _PyUnicode_HASH(unicode) = -1;
917 _PyUnicode_STATE(unicode).interned = 0;
918 _PyUnicode_STATE(unicode).kind = 0;
919 _PyUnicode_STATE(unicode).compact = 0;
920 _PyUnicode_STATE(unicode).ready = 0;
921 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200922 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200924 _PyUnicode_UTF8(unicode) = NULL;
925 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100926 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000927 return unicode;
928}
929
Victor Stinnerf42dc442011-10-02 23:33:16 +0200930static const char*
931unicode_kind_name(PyObject *unicode)
932{
Victor Stinner42dfd712011-10-03 14:41:45 +0200933 /* don't check consistency: unicode_kind_name() is called from
934 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200935 if (!PyUnicode_IS_COMPACT(unicode))
936 {
937 if (!PyUnicode_IS_READY(unicode))
938 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600939 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200940 {
941 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200942 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 return "legacy ascii";
944 else
945 return "legacy latin1";
946 case PyUnicode_2BYTE_KIND:
947 return "legacy UCS2";
948 case PyUnicode_4BYTE_KIND:
949 return "legacy UCS4";
950 default:
951 return "<legacy invalid kind>";
952 }
953 }
954 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600955 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200956 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200957 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200958 return "ascii";
959 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200960 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200961 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200962 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200963 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200965 default:
966 return "<invalid compact kind>";
967 }
968}
969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971/* Functions wrapping macros for use in debugger */
972char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200973 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974}
975
976void *_PyUnicode_compact_data(void *unicode) {
977 return _PyUnicode_COMPACT_DATA(unicode);
978}
979void *_PyUnicode_data(void *unicode){
980 printf("obj %p\n", unicode);
981 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
982 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
983 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
984 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
985 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
986 return PyUnicode_DATA(unicode);
987}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200988
989void
990_PyUnicode_Dump(PyObject *op)
991{
992 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200993 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
994 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
995 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200996
Victor Stinnera849a4b2011-10-03 12:12:11 +0200997 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200998 {
999 if (ascii->state.ascii)
1000 data = (ascii + 1);
1001 else
1002 data = (compact + 1);
1003 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001004 else
1005 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1007
Victor Stinnera849a4b2011-10-03 12:12:11 +02001008 if (ascii->wstr == data)
1009 printf("shared ");
1010 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001011
Victor Stinnera3b334d2011-10-03 13:53:37 +02001012 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001013 printf(" (%zu), ", compact->wstr_length);
1014 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1015 printf("shared ");
1016 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001017 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001018 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001019}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001020#endif
1021
1022PyObject *
1023PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1024{
1025 PyObject *obj;
1026 PyCompactUnicodeObject *unicode;
1027 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001028 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001029 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 Py_ssize_t char_size;
1031 Py_ssize_t struct_size;
1032
1033 /* Optimization for empty strings */
1034 if (size == 0 && unicode_empty != NULL) {
1035 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001036 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 }
1038
Victor Stinner9e9d6892011-10-04 01:02:02 +02001039 is_ascii = 0;
1040 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 struct_size = sizeof(PyCompactUnicodeObject);
1042 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001043 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 char_size = 1;
1045 is_ascii = 1;
1046 struct_size = sizeof(PyASCIIObject);
1047 }
1048 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001049 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 char_size = 1;
1051 }
1052 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001053 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 char_size = 2;
1055 if (sizeof(wchar_t) == 2)
1056 is_sharing = 1;
1057 }
1058 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001059 if (maxchar > MAX_UNICODE) {
1060 PyErr_SetString(PyExc_SystemError,
1061 "invalid maximum character passed to PyUnicode_New");
1062 return NULL;
1063 }
Victor Stinner8f825062012-04-27 13:55:39 +02001064 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 char_size = 4;
1066 if (sizeof(wchar_t) == 4)
1067 is_sharing = 1;
1068 }
1069
1070 /* Ensure we won't overflow the size. */
1071 if (size < 0) {
1072 PyErr_SetString(PyExc_SystemError,
1073 "Negative size passed to PyUnicode_New");
1074 return NULL;
1075 }
1076 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1077 return PyErr_NoMemory();
1078
1079 /* Duplicated allocation code from _PyObject_New() instead of a call to
1080 * PyObject_New() so we are able to allocate space for the object and
1081 * it's data buffer.
1082 */
1083 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1084 if (obj == NULL)
1085 return PyErr_NoMemory();
1086 obj = PyObject_INIT(obj, &PyUnicode_Type);
1087 if (obj == NULL)
1088 return NULL;
1089
1090 unicode = (PyCompactUnicodeObject *)obj;
1091 if (is_ascii)
1092 data = ((PyASCIIObject*)obj) + 1;
1093 else
1094 data = unicode + 1;
1095 _PyUnicode_LENGTH(unicode) = size;
1096 _PyUnicode_HASH(unicode) = -1;
1097 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001098 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099 _PyUnicode_STATE(unicode).compact = 1;
1100 _PyUnicode_STATE(unicode).ready = 1;
1101 _PyUnicode_STATE(unicode).ascii = is_ascii;
1102 if (is_ascii) {
1103 ((char*)data)[size] = 0;
1104 _PyUnicode_WSTR(unicode) = NULL;
1105 }
Victor Stinner8f825062012-04-27 13:55:39 +02001106 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 ((char*)data)[size] = 0;
1108 _PyUnicode_WSTR(unicode) = NULL;
1109 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001111 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 else {
1114 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001115 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001116 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001118 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 ((Py_UCS4*)data)[size] = 0;
1120 if (is_sharing) {
1121 _PyUnicode_WSTR_LENGTH(unicode) = size;
1122 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1123 }
1124 else {
1125 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1126 _PyUnicode_WSTR(unicode) = NULL;
1127 }
1128 }
Victor Stinner8f825062012-04-27 13:55:39 +02001129#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001130 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001131#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001132 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133 return obj;
1134}
1135
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t sequence [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`.  A high surrogate that is
   immediately followed by a low surrogate is combined into one code point;
   every other unit (including a lone surrogate) is copied unchanged.  The
   other width conversions are implemented as macros for efficiency.

   `unicode` must be a 4-byte-kind string whose length equals the number of
   code points produced (asserted).  This function assumes that unicode can
   hold one more code point than wstr characters for a terminating null
   character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Valid surrogate pair: two input units, one output char. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1175
Victor Stinnercd9950f2011-10-02 00:34:53 +02001176static int
Victor Stinner488fa492011-12-12 00:01:39 +01001177unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001178{
Victor Stinner488fa492011-12-12 00:01:39 +01001179 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001180 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001181 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001182 return -1;
1183 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184 return 0;
1185}
1186
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001187static int
1188_copy_characters(PyObject *to, Py_ssize_t to_start,
1189 PyObject *from, Py_ssize_t from_start,
1190 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001192 unsigned int from_kind, to_kind;
1193 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194
Victor Stinneree4544c2012-05-09 22:24:08 +02001195 assert(0 <= how_many);
1196 assert(0 <= from_start);
1197 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001198 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001199 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001200 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201
Victor Stinnerd3f08822012-05-29 12:57:52 +02001202 assert(PyUnicode_Check(to));
1203 assert(PyUnicode_IS_READY(to));
1204 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1205
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001206 if (how_many == 0)
1207 return 0;
1208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001212 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213
Victor Stinnerf1852262012-06-16 16:38:26 +02001214#ifdef Py_DEBUG
1215 if (!check_maxchar
1216 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1217 {
1218 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1219 Py_UCS4 ch;
1220 Py_ssize_t i;
1221 for (i=0; i < how_many; i++) {
1222 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1223 assert(ch <= to_maxchar);
1224 }
1225 }
1226#endif
1227
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001228 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001229 if (check_maxchar
1230 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1231 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 /* Writing Latin-1 characters into an ASCII string requires to
1233 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001234 Py_UCS4 max_char;
1235 max_char = ucs1lib_find_max_char(from_data,
1236 (Py_UCS1*)from_data + how_many);
1237 if (max_char >= 128)
1238 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001239 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001240 Py_MEMCPY((char*)to_data + to_kind * to_start,
1241 (char*)from_data + from_kind * from_start,
1242 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001244 else if (from_kind == PyUnicode_1BYTE_KIND
1245 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001246 {
1247 _PyUnicode_CONVERT_BYTES(
1248 Py_UCS1, Py_UCS2,
1249 PyUnicode_1BYTE_DATA(from) + from_start,
1250 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1251 PyUnicode_2BYTE_DATA(to) + to_start
1252 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001253 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001254 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001255 && to_kind == PyUnicode_4BYTE_KIND)
1256 {
1257 _PyUnicode_CONVERT_BYTES(
1258 Py_UCS1, Py_UCS4,
1259 PyUnicode_1BYTE_DATA(from) + from_start,
1260 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1261 PyUnicode_4BYTE_DATA(to) + to_start
1262 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 }
1264 else if (from_kind == PyUnicode_2BYTE_KIND
1265 && to_kind == PyUnicode_4BYTE_KIND)
1266 {
1267 _PyUnicode_CONVERT_BYTES(
1268 Py_UCS2, Py_UCS4,
1269 PyUnicode_2BYTE_DATA(from) + from_start,
1270 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1271 PyUnicode_4BYTE_DATA(to) + to_start
1272 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001273 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001274 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001275 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1276
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001277 if (!check_maxchar) {
1278 if (from_kind == PyUnicode_2BYTE_KIND
1279 && to_kind == PyUnicode_1BYTE_KIND)
1280 {
1281 _PyUnicode_CONVERT_BYTES(
1282 Py_UCS2, Py_UCS1,
1283 PyUnicode_2BYTE_DATA(from) + from_start,
1284 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1285 PyUnicode_1BYTE_DATA(to) + to_start
1286 );
1287 }
1288 else if (from_kind == PyUnicode_4BYTE_KIND
1289 && to_kind == PyUnicode_1BYTE_KIND)
1290 {
1291 _PyUnicode_CONVERT_BYTES(
1292 Py_UCS4, Py_UCS1,
1293 PyUnicode_4BYTE_DATA(from) + from_start,
1294 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1295 PyUnicode_1BYTE_DATA(to) + to_start
1296 );
1297 }
1298 else if (from_kind == PyUnicode_4BYTE_KIND
1299 && to_kind == PyUnicode_2BYTE_KIND)
1300 {
1301 _PyUnicode_CONVERT_BYTES(
1302 Py_UCS4, Py_UCS2,
1303 PyUnicode_4BYTE_DATA(from) + from_start,
1304 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1305 PyUnicode_2BYTE_DATA(to) + to_start
1306 );
1307 }
1308 else {
1309 assert(0);
1310 return -1;
1311 }
1312 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001313 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001314 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001315 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001316 Py_ssize_t i;
1317
Victor Stinnera0702ab2011-09-29 14:14:38 +02001318 for (i=0; i < how_many; i++) {
1319 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001320 if (ch > to_maxchar)
1321 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1323 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 }
1325 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001326 return 0;
1327}
1328
Victor Stinnerd3f08822012-05-29 12:57:52 +02001329void
1330_PyUnicode_FastCopyCharacters(
1331 PyObject *to, Py_ssize_t to_start,
1332 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001333{
1334 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1335}
1336
1337Py_ssize_t
1338PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1339 PyObject *from, Py_ssize_t from_start,
1340 Py_ssize_t how_many)
1341{
1342 int err;
1343
1344 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1345 PyErr_BadInternalCall();
1346 return -1;
1347 }
1348
Benjamin Petersonbac79492012-01-14 13:34:47 -05001349 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001350 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001351 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001352 return -1;
1353
Victor Stinnerd3f08822012-05-29 12:57:52 +02001354 if (from_start < 0) {
1355 PyErr_SetString(PyExc_IndexError, "string index out of range");
1356 return -1;
1357 }
1358 if (to_start < 0) {
1359 PyErr_SetString(PyExc_IndexError, "string index out of range");
1360 return -1;
1361 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001362 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1363 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1364 PyErr_Format(PyExc_SystemError,
1365 "Cannot write %zi characters at %zi "
1366 "in a string of %zi characters",
1367 how_many, to_start, PyUnicode_GET_LENGTH(to));
1368 return -1;
1369 }
1370
1371 if (how_many == 0)
1372 return 0;
1373
Victor Stinner488fa492011-12-12 00:01:39 +01001374 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001375 return -1;
1376
1377 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1378 if (err) {
1379 PyErr_Format(PyExc_SystemError,
1380 "Cannot copy %s characters "
1381 "into a string of %s characters",
1382 unicode_kind_name(from),
1383 unicode_kind_name(to));
1384 return -1;
1385 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001386 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387}
1388
Victor Stinner17222162011-09-28 22:15:37 +02001389/* Find the maximum code point and count the number of surrogate pairs so a
1390 correct string length can be computed before converting a string to UCS4.
1391 This function counts single surrogates as a character and not as a pair.
1392
1393 Return 0 on success, or -1 on error. */
1394static int
1395find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1396 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397{
1398 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001399 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400
Victor Stinnerc53be962011-10-02 21:33:54 +02001401 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 *num_surrogates = 0;
1403 *maxchar = 0;
1404
1405 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001407 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1408 && (iter+1) < end
1409 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1410 {
1411 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1412 ++(*num_surrogates);
1413 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 }
1415 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001417 {
1418 ch = *iter;
1419 iter++;
1420 }
1421 if (ch > *maxchar) {
1422 *maxchar = ch;
1423 if (*maxchar > MAX_UNICODE) {
1424 PyErr_Format(PyExc_ValueError,
1425 "character U+%x is not in range [U+0000; U+10ffff]",
1426 ch);
1427 return -1;
1428 }
1429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 }
1431 return 0;
1432}
1433
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001434int
1435_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436{
1437 wchar_t *end;
1438 Py_UCS4 maxchar = 0;
1439 Py_ssize_t num_surrogates;
1440#if SIZEOF_WCHAR_T == 2
1441 Py_ssize_t length_wo_surrogates;
1442#endif
1443
Georg Brandl7597add2011-10-05 16:36:47 +02001444 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001445 strings were created using _PyObject_New() and where no canonical
1446 representation (the str field) has been set yet aka strings
1447 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001448 assert(_PyUnicode_CHECK(unicode));
1449 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001451 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 /* Actually, it should neither be interned nor be anything else: */
1454 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001457 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001458 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1463 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 PyErr_NoMemory();
1465 return -1;
1466 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001467 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 _PyUnicode_WSTR(unicode), end,
1469 PyUnicode_1BYTE_DATA(unicode));
1470 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1471 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1472 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1473 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001474 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001475 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001476 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 }
1478 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001479 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001480 _PyUnicode_UTF8(unicode) = NULL;
1481 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 }
1483 PyObject_FREE(_PyUnicode_WSTR(unicode));
1484 _PyUnicode_WSTR(unicode) = NULL;
1485 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1486 }
1487 /* In this case we might have to convert down from 4-byte native
1488 wchar_t to 2-byte unicode. */
1489 else if (maxchar < 65536) {
1490 assert(num_surrogates == 0 &&
1491 "FindMaxCharAndNumSurrogatePairs() messed up");
1492
Victor Stinner506f5922011-09-28 22:34:18 +02001493#if SIZEOF_WCHAR_T == 2
1494 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001495 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001496 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1497 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1498 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001499 _PyUnicode_UTF8(unicode) = NULL;
1500 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001501#else
1502 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001504 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001505 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001506 PyErr_NoMemory();
1507 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001508 }
Victor Stinner506f5922011-09-28 22:34:18 +02001509 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1510 _PyUnicode_WSTR(unicode), end,
1511 PyUnicode_2BYTE_DATA(unicode));
1512 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1513 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1514 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001515 _PyUnicode_UTF8(unicode) = NULL;
1516 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001517 PyObject_FREE(_PyUnicode_WSTR(unicode));
1518 _PyUnicode_WSTR(unicode) = NULL;
1519 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1520#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 }
1522 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1523 else {
1524#if SIZEOF_WCHAR_T == 2
1525 /* in case the native representation is 2-bytes, we need to allocate a
1526 new normalized 4-byte version. */
1527 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001528 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1529 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001530 PyErr_NoMemory();
1531 return -1;
1532 }
1533 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1534 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001535 _PyUnicode_UTF8(unicode) = NULL;
1536 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001537 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1538 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001539 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 PyObject_FREE(_PyUnicode_WSTR(unicode));
1541 _PyUnicode_WSTR(unicode) = NULL;
1542 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1543#else
1544 assert(num_surrogates == 0);
1545
Victor Stinnerc3c74152011-10-02 20:39:55 +02001546 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001548 _PyUnicode_UTF8(unicode) = NULL;
1549 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1551#endif
1552 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1553 }
1554 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001555 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001556 return 0;
1557}
1558
Alexander Belopolsky40018472011-02-26 01:02:56 +00001559static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001560unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561{
Walter Dörwald16807132007-05-25 13:52:07 +00001562 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001563 case SSTATE_NOT_INTERNED:
1564 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001565
Benjamin Peterson29060642009-01-31 22:14:21 +00001566 case SSTATE_INTERNED_MORTAL:
1567 /* revive dead object temporarily for DelItem */
1568 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001569 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 Py_FatalError(
1571 "deletion of interned string failed");
1572 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 case SSTATE_INTERNED_IMMORTAL:
1575 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001576
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 default:
1578 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001579 }
1580
Victor Stinner03490912011-10-03 23:45:12 +02001581 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001582 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001583 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001584 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001585 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1586 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001588 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589}
1590
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character string from the
   unicode_latin1 table), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only a string with a canonical representation and exactly one
       character can be a cached latin1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1607
Alexander Belopolsky40018472011-02-26 01:02:56 +00001608static int
Victor Stinner488fa492011-12-12 00:01:39 +01001609unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001610{
Victor Stinner488fa492011-12-12 00:01:39 +01001611 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 if (Py_REFCNT(unicode) != 1)
1613 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001614 if (_PyUnicode_HASH(unicode) != -1)
1615 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001616 if (PyUnicode_CHECK_INTERNED(unicode))
1617 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001618 if (!PyUnicode_CheckExact(unicode))
1619 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001620#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001621 /* singleton refcount is greater than 1 */
1622 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001623#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 return 1;
1625}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001626
Victor Stinnerfe226c02011-10-03 03:52:20 +02001627static int
1628unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1629{
1630 PyObject *unicode;
1631 Py_ssize_t old_length;
1632
1633 assert(p_unicode != NULL);
1634 unicode = *p_unicode;
1635
1636 assert(unicode != NULL);
1637 assert(PyUnicode_Check(unicode));
1638 assert(0 <= length);
1639
Victor Stinner910337b2011-10-03 03:20:16 +02001640 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 old_length = PyUnicode_WSTR_LENGTH(unicode);
1642 else
1643 old_length = PyUnicode_GET_LENGTH(unicode);
1644 if (old_length == length)
1645 return 0;
1646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001648 _Py_INCREF_UNICODE_EMPTY();
1649 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001650 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001651 Py_DECREF(*p_unicode);
1652 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001653 return 0;
1654 }
1655
Victor Stinner488fa492011-12-12 00:01:39 +01001656 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001657 PyObject *copy = resize_copy(unicode, length);
1658 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001659 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001660 Py_DECREF(*p_unicode);
1661 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001663 }
1664
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001666 PyObject *new_unicode = resize_compact(unicode, length);
1667 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001669 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001670 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001671 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001672 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001673}
1674
Alexander Belopolsky40018472011-02-26 01:02:56 +00001675int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001677{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 PyObject *unicode;
1679 if (p_unicode == NULL) {
1680 PyErr_BadInternalCall();
1681 return -1;
1682 }
1683 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001684 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001685 {
1686 PyErr_BadInternalCall();
1687 return -1;
1688 }
1689 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001690}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001691
Victor Stinnerc5166102012-02-22 13:55:02 +01001692/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001693
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001694 WARNING: The function doesn't copy the terminating null character and
1695 doesn't check the maximum character (may write a latin1 character in an
1696 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001697static void
1698unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1699 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001700{
1701 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1702 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001703 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001704
1705 switch (kind) {
1706 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001707 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001708#ifdef Py_DEBUG
1709 if (PyUnicode_IS_ASCII(unicode)) {
1710 Py_UCS4 maxchar = ucs1lib_find_max_char(
1711 (const Py_UCS1*)str,
1712 (const Py_UCS1*)str + len);
1713 assert(maxchar < 128);
1714 }
1715#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001716 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001717 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001718 }
1719 case PyUnicode_2BYTE_KIND: {
1720 Py_UCS2 *start = (Py_UCS2 *)data + index;
1721 Py_UCS2 *ucs2 = start;
1722 assert(index <= PyUnicode_GET_LENGTH(unicode));
1723
Victor Stinner184252a2012-06-16 02:57:41 +02001724 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001725 *ucs2 = (Py_UCS2)*str;
1726
1727 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001728 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001729 }
1730 default: {
1731 Py_UCS4 *start = (Py_UCS4 *)data + index;
1732 Py_UCS4 *ucs4 = start;
1733 assert(kind == PyUnicode_4BYTE_KIND);
1734 assert(index <= PyUnicode_GET_LENGTH(unicode));
1735
Victor Stinner184252a2012-06-16 02:57:41 +02001736 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 *ucs4 = (Py_UCS4)*str;
1738
1739 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001740 }
1741 }
1742}
1743
1744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745static PyObject*
1746get_latin1_char(unsigned char ch)
1747{
Victor Stinnera464fc12011-10-02 20:39:30 +02001748 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001750 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 if (!unicode)
1752 return NULL;
1753 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001754 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 unicode_latin1[ch] = unicode;
1756 }
1757 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001758 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759}
1760
Alexander Belopolsky40018472011-02-26 01:02:56 +00001761PyObject *
1762PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001764 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 Py_UCS4 maxchar = 0;
1766 Py_ssize_t num_surrogates;
1767
1768 if (u == NULL)
1769 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001771 /* If the Unicode data is known at construction time, we can apply
1772 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001775 if (size == 0)
1776 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 /* Single character Unicode objects in the Latin-1 range are
1779 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001780 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 return get_latin1_char((unsigned char)*u);
1782
1783 /* If not empty and not single character, copy the Unicode data
1784 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001785 if (find_maxchar_surrogates(u, u + size,
1786 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 return NULL;
1788
Victor Stinner8faf8212011-12-08 22:14:11 +01001789 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 if (!unicode)
1791 return NULL;
1792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 switch (PyUnicode_KIND(unicode)) {
1794 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001795 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1797 break;
1798 case PyUnicode_2BYTE_KIND:
1799#if Py_UNICODE_SIZE == 2
1800 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1801#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001802 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1804#endif
1805 break;
1806 case PyUnicode_4BYTE_KIND:
1807#if SIZEOF_WCHAR_T == 2
1808 /* This is the only case which has to process surrogates, thus
1809 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001810 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811#else
1812 assert(num_surrogates == 0);
1813 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1814#endif
1815 break;
1816 default:
1817 assert(0 && "Impossible state");
1818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821}
1822
Alexander Belopolsky40018472011-02-26 01:02:56 +00001823PyObject *
1824PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001825{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 if (size < 0) {
1827 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001828 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001829 return NULL;
1830 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001831 if (u != NULL)
1832 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1833 else
1834 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001835}
1836
Alexander Belopolsky40018472011-02-26 01:02:56 +00001837PyObject *
1838PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001839{
1840 size_t size = strlen(u);
1841 if (size > PY_SSIZE_T_MAX) {
1842 PyErr_SetString(PyExc_OverflowError, "input too long");
1843 return NULL;
1844 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001845 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001846}
1847
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001848PyObject *
1849_PyUnicode_FromId(_Py_Identifier *id)
1850{
1851 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001852 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1853 strlen(id->string),
1854 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855 if (!id->object)
1856 return NULL;
1857 PyUnicode_InternInPlace(&id->object);
1858 assert(!id->next);
1859 id->next = static_strings;
1860 static_strings = id;
1861 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001862 return id->object;
1863}
1864
1865void
1866_PyUnicode_ClearStaticStrings()
1867{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001868 _Py_Identifier *tmp, *s = static_strings;
1869 while (s) {
1870 Py_DECREF(s->object);
1871 s->object = NULL;
1872 tmp = s->next;
1873 s->next = NULL;
1874 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001875 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001876 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001877}
1878
Benjamin Peterson0df54292012-03-26 14:50:32 -04001879/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001880
Victor Stinnerd3f08822012-05-29 12:57:52 +02001881PyObject*
1882_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001883{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001884 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001885 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001886 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001887#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001888 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001889#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001890 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001891 }
Victor Stinner785938e2011-12-11 20:09:03 +01001892 unicode = PyUnicode_New(size, 127);
1893 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001894 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001895 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1896 assert(_PyUnicode_CheckConsistency(unicode, 1));
1897 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001898}
1899
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001900static Py_UCS4
1901kind_maxchar_limit(unsigned int kind)
1902{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001903 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001904 case PyUnicode_1BYTE_KIND:
1905 return 0x80;
1906 case PyUnicode_2BYTE_KIND:
1907 return 0x100;
1908 case PyUnicode_4BYTE_KIND:
1909 return 0x10000;
1910 default:
1911 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001912 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001913 }
1914}
1915
Victor Stinnere6abb482012-05-02 01:15:40 +02001916Py_LOCAL_INLINE(Py_UCS4)
1917align_maxchar(Py_UCS4 maxchar)
1918{
1919 if (maxchar <= 127)
1920 return 127;
1921 else if (maxchar <= 255)
1922 return 255;
1923 else if (maxchar <= 65535)
1924 return 65535;
1925 else
1926 return MAX_UNICODE;
1927}
1928
Victor Stinner702c7342011-10-05 13:50:52 +02001929static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001930_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001933 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001934
Serhiy Storchaka678db842013-01-26 12:16:36 +02001935 if (size == 0)
1936 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001937 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001938 if (size == 1)
1939 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001940
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001941 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001942 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 if (!res)
1944 return NULL;
1945 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001946 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001948}
1949
Victor Stinnere57b1c02011-09-28 22:20:48 +02001950static PyObject*
1951_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952{
1953 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001954 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001955
Serhiy Storchaka678db842013-01-26 12:16:36 +02001956 if (size == 0)
1957 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001958 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001959 if (size == 1) {
1960 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001961 int kind;
1962 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001963 if (ch < 256)
1964 return get_latin1_char((unsigned char)ch);
1965
1966 res = PyUnicode_New(1, ch);
1967 if (res == NULL)
1968 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001969 kind = PyUnicode_KIND(res);
1970 data = PyUnicode_DATA(res);
1971 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001972 assert(_PyUnicode_CheckConsistency(res, 1));
1973 return res;
1974 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001977 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (!res)
1979 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001982 else {
1983 _PyUnicode_CONVERT_BYTES(
1984 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1985 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001986 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 return res;
1988}
1989
Victor Stinnere57b1c02011-09-28 22:20:48 +02001990static PyObject*
1991_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992{
1993 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001994 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001995
Serhiy Storchaka678db842013-01-26 12:16:36 +02001996 if (size == 0)
1997 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001998 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001999 if (size == 1) {
2000 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002001 int kind;
2002 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002003 if (ch < 256)
2004 return get_latin1_char((unsigned char)ch);
2005
2006 res = PyUnicode_New(1, ch);
2007 if (res == NULL)
2008 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002009 kind = PyUnicode_KIND(res);
2010 data = PyUnicode_DATA(res);
2011 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002012 assert(_PyUnicode_CheckConsistency(res, 1));
2013 return res;
2014 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002015
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002016 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002017 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 if (!res)
2019 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002020 if (max_char < 256)
2021 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2022 PyUnicode_1BYTE_DATA(res));
2023 else if (max_char < 0x10000)
2024 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2025 PyUnicode_2BYTE_DATA(res));
2026 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002028 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 return res;
2030}
2031
2032PyObject*
2033PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2034{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002035 if (size < 0) {
2036 PyErr_SetString(PyExc_ValueError, "size must be positive");
2037 return NULL;
2038 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002039 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002041 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002043 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002045 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002046 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002047 PyErr_SetString(PyExc_SystemError, "invalid kind");
2048 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050}
2051
Victor Stinnerece58de2012-04-23 23:36:38 +02002052Py_UCS4
2053_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2054{
2055 enum PyUnicode_Kind kind;
2056 void *startptr, *endptr;
2057
2058 assert(PyUnicode_IS_READY(unicode));
2059 assert(0 <= start);
2060 assert(end <= PyUnicode_GET_LENGTH(unicode));
2061 assert(start <= end);
2062
2063 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2064 return PyUnicode_MAX_CHAR_VALUE(unicode);
2065
2066 if (start == end)
2067 return 127;
2068
Victor Stinner94d558b2012-04-27 22:26:58 +02002069 if (PyUnicode_IS_ASCII(unicode))
2070 return 127;
2071
Victor Stinnerece58de2012-04-23 23:36:38 +02002072 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002073 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002074 endptr = (char *)startptr + end * kind;
2075 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002076 switch(kind) {
2077 case PyUnicode_1BYTE_KIND:
2078 return ucs1lib_find_max_char(startptr, endptr);
2079 case PyUnicode_2BYTE_KIND:
2080 return ucs2lib_find_max_char(startptr, endptr);
2081 case PyUnicode_4BYTE_KIND:
2082 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002083 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002084 assert(0);
2085 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002086 }
2087}
2088
Victor Stinner25a4b292011-10-06 12:31:55 +02002089/* Ensure that a string uses the most efficient storage, if it is not the
2090 case: create a new string with of the right kind. Write NULL into *p_unicode
2091 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002092static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002093unicode_adjust_maxchar(PyObject **p_unicode)
2094{
2095 PyObject *unicode, *copy;
2096 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002097 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002098 unsigned int kind;
2099
2100 assert(p_unicode != NULL);
2101 unicode = *p_unicode;
2102 assert(PyUnicode_IS_READY(unicode));
2103 if (PyUnicode_IS_ASCII(unicode))
2104 return;
2105
2106 len = PyUnicode_GET_LENGTH(unicode);
2107 kind = PyUnicode_KIND(unicode);
2108 if (kind == PyUnicode_1BYTE_KIND) {
2109 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002110 max_char = ucs1lib_find_max_char(u, u + len);
2111 if (max_char >= 128)
2112 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002113 }
2114 else if (kind == PyUnicode_2BYTE_KIND) {
2115 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002116 max_char = ucs2lib_find_max_char(u, u + len);
2117 if (max_char >= 256)
2118 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 }
2120 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002121 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002123 max_char = ucs4lib_find_max_char(u, u + len);
2124 if (max_char >= 0x10000)
2125 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002126 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002127 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002128 if (copy != NULL)
2129 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 Py_DECREF(unicode);
2131 *p_unicode = copy;
2132}
2133
Victor Stinner034f6cf2011-09-30 02:26:44 +02002134PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002135_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002136{
Victor Stinner87af4f22011-11-21 23:03:47 +01002137 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002138 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002139
Victor Stinner034f6cf2011-09-30 02:26:44 +02002140 if (!PyUnicode_Check(unicode)) {
2141 PyErr_BadInternalCall();
2142 return NULL;
2143 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002144 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002145 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146
Victor Stinner87af4f22011-11-21 23:03:47 +01002147 length = PyUnicode_GET_LENGTH(unicode);
2148 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149 if (!copy)
2150 return NULL;
2151 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2152
Victor Stinner87af4f22011-11-21 23:03:47 +01002153 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2154 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002155 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002157}
2158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159
Victor Stinnerbc603d12011-10-02 01:00:40 +02002160/* Widen Unicode objects to larger buffers. Don't write terminating null
2161 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162
2163void*
2164_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2165{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002166 Py_ssize_t len;
2167 void *result;
2168 unsigned int skind;
2169
Benjamin Petersonbac79492012-01-14 13:34:47 -05002170 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002171 return NULL;
2172
2173 len = PyUnicode_GET_LENGTH(s);
2174 skind = PyUnicode_KIND(s);
2175 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002176 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 return NULL;
2178 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002179 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002180 case PyUnicode_2BYTE_KIND:
2181 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2182 if (!result)
2183 return PyErr_NoMemory();
2184 assert(skind == PyUnicode_1BYTE_KIND);
2185 _PyUnicode_CONVERT_BYTES(
2186 Py_UCS1, Py_UCS2,
2187 PyUnicode_1BYTE_DATA(s),
2188 PyUnicode_1BYTE_DATA(s) + len,
2189 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002191 case PyUnicode_4BYTE_KIND:
2192 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2193 if (!result)
2194 return PyErr_NoMemory();
2195 if (skind == PyUnicode_2BYTE_KIND) {
2196 _PyUnicode_CONVERT_BYTES(
2197 Py_UCS2, Py_UCS4,
2198 PyUnicode_2BYTE_DATA(s),
2199 PyUnicode_2BYTE_DATA(s) + len,
2200 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002202 else {
2203 assert(skind == PyUnicode_1BYTE_KIND);
2204 _PyUnicode_CONVERT_BYTES(
2205 Py_UCS1, Py_UCS4,
2206 PyUnicode_1BYTE_DATA(s),
2207 PyUnicode_1BYTE_DATA(s) + len,
2208 result);
2209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002211 default:
2212 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 }
Victor Stinner01698042011-10-04 00:04:26 +02002214 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 return NULL;
2216}
2217
2218static Py_UCS4*
2219as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2220 int copy_null)
2221{
2222 int kind;
2223 void *data;
2224 Py_ssize_t len, targetlen;
2225 if (PyUnicode_READY(string) == -1)
2226 return NULL;
2227 kind = PyUnicode_KIND(string);
2228 data = PyUnicode_DATA(string);
2229 len = PyUnicode_GET_LENGTH(string);
2230 targetlen = len;
2231 if (copy_null)
2232 targetlen++;
2233 if (!target) {
2234 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2235 PyErr_NoMemory();
2236 return NULL;
2237 }
2238 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2239 if (!target) {
2240 PyErr_NoMemory();
2241 return NULL;
2242 }
2243 }
2244 else {
2245 if (targetsize < targetlen) {
2246 PyErr_Format(PyExc_SystemError,
2247 "string is longer than the buffer");
2248 if (copy_null && 0 < targetsize)
2249 target[0] = 0;
2250 return NULL;
2251 }
2252 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002253 if (kind == PyUnicode_1BYTE_KIND) {
2254 Py_UCS1 *start = (Py_UCS1 *) data;
2255 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002257 else if (kind == PyUnicode_2BYTE_KIND) {
2258 Py_UCS2 *start = (Py_UCS2 *) data;
2259 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2260 }
2261 else {
2262 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 if (copy_null)
2266 target[len] = 0;
2267 return target;
2268}
2269
2270Py_UCS4*
2271PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2272 int copy_null)
2273{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002274 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 PyErr_BadInternalCall();
2276 return NULL;
2277 }
2278 return as_ucs4(string, target, targetsize, copy_null);
2279}
2280
2281Py_UCS4*
2282PyUnicode_AsUCS4Copy(PyObject *string)
2283{
2284 return as_ucs4(string, NULL, 0, 1);
2285}
2286
2287#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002288
Alexander Belopolsky40018472011-02-26 01:02:56 +00002289PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002290PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002294 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002295 PyErr_BadInternalCall();
2296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 }
2298
Martin v. Löwis790465f2008-04-05 20:41:37 +00002299 if (size == -1) {
2300 size = wcslen(w);
2301 }
2302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002303 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304}
2305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002307
Walter Dörwald346737f2007-05-31 10:44:43 +00002308static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002309makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002310 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002311{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002312 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002313 if (longflag)
2314 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002315 else if (longlongflag) {
2316 /* longlongflag should only ever be nonzero on machines with
2317 HAVE_LONG_LONG defined */
2318#ifdef HAVE_LONG_LONG
2319 char *f = PY_FORMAT_LONG_LONG;
2320 while (*f)
2321 *fmt++ = *f++;
2322#else
2323 /* we shouldn't ever get here */
2324 assert(0);
2325 *fmt++ = 'l';
2326#endif
2327 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002328 else if (size_tflag) {
2329 char *f = PY_FORMAT_SIZE_T;
2330 while (*f)
2331 *fmt++ = *f++;
2332 }
2333 *fmt++ = c;
2334 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002335}
2336
/* Maximum number of characters required for output of %lld or %p; used to
   size the sprintf() scratch buffers in unicode_fromformat_arg().
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In integer arithmetic (n*53-1)/22 + 1 equals ceil(n*53/22), hence the
   leading 2: one to complete the ceiling, one for the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002341
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002342static int
2343unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2344 Py_ssize_t width, Py_ssize_t precision)
2345{
2346 Py_ssize_t length, fill, arglen;
2347 Py_UCS4 maxchar;
2348
2349 if (PyUnicode_READY(str) == -1)
2350 return -1;
2351
2352 length = PyUnicode_GET_LENGTH(str);
2353 if ((precision == -1 || precision >= length)
2354 && width <= length)
2355 return _PyUnicodeWriter_WriteStr(writer, str);
2356
2357 if (precision != -1)
2358 length = Py_MIN(precision, length);
2359
2360 arglen = Py_MAX(length, width);
2361 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2362 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2363 else
2364 maxchar = writer->maxchar;
2365
2366 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2367 return -1;
2368
2369 if (width > length) {
2370 fill = width - length;
2371 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2372 return -1;
2373 writer->pos += fill;
2374 }
2375
2376 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2377 str, 0, length);
2378 writer->pos += length;
2379 return 0;
2380}
2381
2382static int
2383unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2384 Py_ssize_t width, Py_ssize_t precision)
2385{
2386 /* UTF-8 */
2387 Py_ssize_t length;
2388 PyObject *unicode;
2389 int res;
2390
2391 length = strlen(str);
2392 if (precision != -1)
2393 length = Py_MIN(length, precision);
2394 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2395 if (unicode == NULL)
2396 return -1;
2397
2398 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2399 Py_DECREF(unicode);
2400 return res;
2401}
2402
/* Process one '%' directive of a PyUnicode_FromFormatV() format string.

   writer: output writer the formatted text is appended to.
   f:      points at the '%' that starts the directive.
   vargs:  pointer to the argument list; the arguments consumed by the
           directive are fetched from it with va_arg().

   Recognized syntax: '%' ['0'] [width] ['.' precision] [l | ll | z] conv,
   where conv is one of c, i, d, u, x, p, s, U, V, S, R, A or '%'.
   The l/ll/z modifiers are only recognized in front of d, u and i.

   Returns a pointer to the first character after the directive, or NULL on
   error.  For an unknown conversion the rest of the format string, starting
   at the '%', is copied verbatim to the output and a pointer to the
   terminating null byte is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember the start of the directive for the "unknown code" fallback */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5.
       -1 means "not specified" for both. */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* check that width*10 + digit cannot exceed PY_SSIZE_T_MAX
               before computing it */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow guard as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu (and the 'i' variants). */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* f now points at the conversion character.  If it is the last
       character of the format string, turn off over-allocation
       (presumably because nothing more will be appended afterwards). */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* single character given as an int code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* Integers are formatted with sprintf() into a local buffer using
           a format rebuilt by makefmt() (without width/precision), then
           padded by hand below. */
        /* used by sprintf */
        char fmt[10]; /* should be enough for "%0lld\0" */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        if (*f == 'u') {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* %x always takes an int: the length modifiers are only
               recognized in front of d, u and i */
            makefmt(fmt, 0, 0, 0, 'x');
            len = sprintf(buffer, fmt, va_arg(*vargs, int));
        }
        else {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* from here on, precision is the number of characters written
           for the number itself, including any '0' padding */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* width padding: '0' if the zero flag was given, else spaces */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* precision padding: always zeros.
           NOTE(review): both paddings are written before the sprintf()
           output, so for a negative number the fill precedes the '-'. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        unicode_write_cstr(writer->buffer, writer->pos, buffer, len);
        writer->pos += len;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* no prefix at all: shift the digits (and the null byte)
               right by two and insert "0x" */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127);
        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
            return NULL;
        unicode_write_cstr(writer->buffer, writer->pos, number, len);
        writer->pos += len;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* string object, written as is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* two arguments: a string object and a C string; the C string is
           used only when the object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* result of PyObject_Str(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* result of PyObject_Repr(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* result of PyObject_ASCII(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* literal percent sign; consumes no argument */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* step over the conversion character */
    f++;
    return f;
}
2716
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717PyObject *
2718PyUnicode_FromFormatV(const char *format, va_list vargs)
2719{
Victor Stinnere215d962012-10-06 23:03:36 +02002720 va_list vargs2;
2721 const char *f;
2722 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002723
Victor Stinner8f674cc2013-04-17 23:02:17 +02002724 _PyUnicodeWriter_Init(&writer);
2725 writer.min_length = strlen(format) + 100;
2726 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002727
2728 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2729 Copy it to be able to pass a reference to a subfunction. */
2730 Py_VA_COPY(vargs2, vargs);
2731
2732 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002733 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002734 f = unicode_fromformat_arg(&writer, f, &vargs2);
2735 if (f == NULL)
2736 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002739 const char *p;
2740 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002741
Victor Stinnere215d962012-10-06 23:03:36 +02002742 p = f;
2743 do
2744 {
2745 if ((unsigned char)*p > 127) {
2746 PyErr_Format(PyExc_ValueError,
2747 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2748 "string, got a non-ASCII byte: 0x%02x",
2749 (unsigned char)*p);
2750 return NULL;
2751 }
2752 p++;
2753 }
2754 while (*p != '\0' && *p != '%');
2755 len = p - f;
2756
2757 if (*p == '\0')
2758 writer.overallocate = 0;
2759 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2760 goto fail;
2761 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2762 writer.pos += len;
2763
2764 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002765 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 }
Victor Stinnere215d962012-10-06 23:03:36 +02002767 return _PyUnicodeWriter_Finish(&writer);
2768
2769 fail:
2770 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002771 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002772}
2773
Walter Dörwaldd2034312007-05-18 16:29:38 +00002774PyObject *
2775PyUnicode_FromFormat(const char *format, ...)
2776{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 PyObject* ret;
2778 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002779
2780#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002781 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002784#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002785 ret = PyUnicode_FromFormatV(format, vargs);
2786 va_end(vargs);
2787 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002788}
2789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002790#ifdef HAVE_WCHAR_H
2791
Victor Stinner5593d8a2010-10-02 11:11:27 +00002792/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2793 convert a Unicode object to a wide character string.
2794
Victor Stinnerd88d9832011-09-06 02:00:05 +02002795 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002796 character) required to convert the unicode object. Ignore size argument.
2797
Victor Stinnerd88d9832011-09-06 02:00:05 +02002798 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002799 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002800 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002801static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002802unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002803 wchar_t *w,
2804 Py_ssize_t size)
2805{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002806 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 const wchar_t *wstr;
2808
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002809 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 if (wstr == NULL)
2811 return -1;
2812
Victor Stinner5593d8a2010-10-02 11:11:27 +00002813 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814 if (size > res)
2815 size = res + 1;
2816 else
2817 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002819 return res;
2820 }
2821 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002822 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002823}
2824
2825Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002826PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002827 wchar_t *w,
2828 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829{
2830 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002831 PyErr_BadInternalCall();
2832 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002834 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835}
2836
Victor Stinner137c34c2010-09-29 10:25:54 +00002837wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002838PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002839 Py_ssize_t *size)
2840{
2841 wchar_t* buffer;
2842 Py_ssize_t buflen;
2843
2844 if (unicode == NULL) {
2845 PyErr_BadInternalCall();
2846 return NULL;
2847 }
2848
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002849 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002850 if (buflen == -1)
2851 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002852 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002853 PyErr_NoMemory();
2854 return NULL;
2855 }
2856
Victor Stinner137c34c2010-09-29 10:25:54 +00002857 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2858 if (buffer == NULL) {
2859 PyErr_NoMemory();
2860 return NULL;
2861 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002862 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002863 if (buflen == -1) {
2864 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002865 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002866 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002867 if (size != NULL)
2868 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002869 return buffer;
2870}
2871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002872#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873
Alexander Belopolsky40018472011-02-26 01:02:56 +00002874PyObject *
2875PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002877 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002878 void *data;
2879 int kind;
2880
Victor Stinner8faf8212011-12-08 22:14:11 +01002881 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002882 PyErr_SetString(PyExc_ValueError,
2883 "chr() arg not in range(0x110000)");
2884 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002885 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002886
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002887 if ((Py_UCS4)ordinal < 256)
2888 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 v = PyUnicode_New(1, ordinal);
2891 if (v == NULL)
2892 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002893 kind = PyUnicode_KIND(v);
2894 data = PyUnicode_DATA(v);
2895 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002896 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002897 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002898}
2899
Alexander Belopolsky40018472011-02-26 01:02:56 +00002900PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002901PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002903 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002904 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002905 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002906 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002907 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002908 Py_INCREF(obj);
2909 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002910 }
2911 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 /* For a Unicode subtype that's not a Unicode object,
2913 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002914 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002915 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002916 PyErr_Format(PyExc_TypeError,
2917 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002918 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002919 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002920}
2921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002923PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002924 const char *encoding,
2925 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002926{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002927 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002928 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002929
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 PyErr_BadInternalCall();
2932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002934
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002935 /* Decoding bytes objects is the most common case and should be fast */
2936 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002937 if (PyBytes_GET_SIZE(obj) == 0)
2938 _Py_RETURN_UNICODE_EMPTY();
2939 v = PyUnicode_Decode(
2940 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2941 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002942 return v;
2943 }
2944
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002945 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002946 PyErr_SetString(PyExc_TypeError,
2947 "decoding str is not supported");
2948 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002950
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002951 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2952 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2953 PyErr_Format(PyExc_TypeError,
2954 "coercing to str: need bytes, bytearray "
2955 "or buffer-like object, %.80s found",
2956 Py_TYPE(obj)->tp_name);
2957 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002958 }
Tim Petersced69f82003-09-16 20:30:58 +00002959
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002960 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002961 PyBuffer_Release(&buffer);
2962 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002964
Serhiy Storchaka05997252013-01-26 12:14:02 +02002965 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002966 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002967 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968}
2969
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The null-terminated result is stored in lower, which must provide room
   for lower_len chars.  Return 1 on success, 0 on error (the result,
   including its terminating null, does not fit into lower_len chars; in
   that case the content of lower is unspecified). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for at least the terminator nothing can be stored, and
       lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating null: do not overflow a short
           buffer with the default name. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last writable slot; it is reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3006
Alexander Belopolsky40018472011-02-26 01:02:56 +00003007PyObject *
3008PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003009 Py_ssize_t size,
3010 const char *encoding,
3011 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003012{
3013 PyObject *buffer = NULL, *unicode;
3014 Py_buffer info;
3015 char lower[11]; /* Enough for any encoding shortcut */
3016
Fred Drakee4315f52000-05-09 19:53:39 +00003017 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003018 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003019 if ((strcmp(lower, "utf-8") == 0) ||
3020 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003021 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003022 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003023 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003024 (strcmp(lower, "iso-8859-1") == 0))
3025 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003026#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003027 else if (strcmp(lower, "mbcs") == 0)
3028 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003029#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003030 else if (strcmp(lower, "ascii") == 0)
3031 return PyUnicode_DecodeASCII(s, size, errors);
3032 else if (strcmp(lower, "utf-16") == 0)
3033 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3034 else if (strcmp(lower, "utf-32") == 0)
3035 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037
3038 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003039 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003040 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003041 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003042 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 if (buffer == NULL)
3044 goto onError;
3045 unicode = PyCodec_Decode(buffer, encoding, errors);
3046 if (unicode == NULL)
3047 goto onError;
3048 if (!PyUnicode_Check(unicode)) {
3049 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003050 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003051 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 Py_DECREF(unicode);
3053 goto onError;
3054 }
3055 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003056 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003057
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 Py_XDECREF(buffer);
3060 return NULL;
3061}
3062
Alexander Belopolsky40018472011-02-26 01:02:56 +00003063PyObject *
3064PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003065 const char *encoding,
3066 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003067{
3068 PyObject *v;
3069
3070 if (!PyUnicode_Check(unicode)) {
3071 PyErr_BadArgument();
3072 goto onError;
3073 }
3074
3075 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
3078 /* Decode via the codec registry */
3079 v = PyCodec_Decode(unicode, encoding, errors);
3080 if (v == NULL)
3081 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003082 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003083
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003085 return NULL;
3086}
3087
Alexander Belopolsky40018472011-02-26 01:02:56 +00003088PyObject *
3089PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003090 const char *encoding,
3091 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003092{
3093 PyObject *v;
3094
3095 if (!PyUnicode_Check(unicode)) {
3096 PyErr_BadArgument();
3097 goto onError;
3098 }
3099
3100 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003101 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003102
3103 /* Decode via the codec registry */
3104 v = PyCodec_Decode(unicode, encoding, errors);
3105 if (v == NULL)
3106 goto onError;
3107 if (!PyUnicode_Check(v)) {
3108 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003109 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003110 Py_TYPE(v)->tp_name);
3111 Py_DECREF(v);
3112 goto onError;
3113 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003114 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003115
Benjamin Peterson29060642009-01-31 22:14:21 +00003116 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003117 return NULL;
3118}
3119
Alexander Belopolsky40018472011-02-26 01:02:56 +00003120PyObject *
3121PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003122 Py_ssize_t size,
3123 const char *encoding,
3124 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125{
3126 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003127
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 unicode = PyUnicode_FromUnicode(s, size);
3129 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3132 Py_DECREF(unicode);
3133 return v;
3134}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 const char *encoding,
3139 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003140{
3141 PyObject *v;
3142
3143 if (!PyUnicode_Check(unicode)) {
3144 PyErr_BadArgument();
3145 goto onError;
3146 }
3147
3148 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003150
3151 /* Encode via the codec registry */
3152 v = PyCodec_Encode(unicode, encoding, errors);
3153 if (v == NULL)
3154 goto onError;
3155 return v;
3156
Benjamin Peterson29060642009-01-31 22:14:21 +00003157 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003158 return NULL;
3159}
3160
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003161static size_t
3162wcstombs_errorpos(const wchar_t *wstr)
3163{
3164 size_t len;
3165#if SIZEOF_WCHAR_T == 2
3166 wchar_t buf[3];
3167#else
3168 wchar_t buf[2];
3169#endif
3170 char outbuf[MB_LEN_MAX];
3171 const wchar_t *start, *previous;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003172
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003173#if SIZEOF_WCHAR_T == 2
3174 buf[2] = 0;
3175#else
3176 buf[1] = 0;
3177#endif
3178 start = wstr;
3179 while (*wstr != L'\0')
3180 {
3181 previous = wstr;
3182#if SIZEOF_WCHAR_T == 2
3183 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
3184 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
3185 {
3186 buf[0] = wstr[0];
3187 buf[1] = wstr[1];
3188 wstr += 2;
3189 }
3190 else {
3191 buf[0] = *wstr;
3192 buf[1] = 0;
3193 wstr++;
3194 }
3195#else
3196 buf[0] = *wstr;
3197 wstr++;
3198#endif
3199 len = wcstombs(outbuf, buf, sizeof(outbuf));
Victor Stinner2f197072011-12-17 07:08:30 +01003200 if (len == (size_t)-1)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003201 return previous - start;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003202 }
3203
3204 /* failed to find the unencodable character */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003205 return 0;
3206}
3207
Victor Stinner1b579672011-12-17 05:47:23 +01003208static int
3209locale_error_handler(const char *errors, int *surrogateescape)
3210{
3211 if (errors == NULL) {
3212 *surrogateescape = 0;
3213 return 0;
3214 }
3215
3216 if (strcmp(errors, "strict") == 0) {
3217 *surrogateescape = 0;
3218 return 0;
3219 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003220 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003221 *surrogateescape = 1;
3222 return 0;
3223 }
3224 PyErr_Format(PyExc_ValueError,
3225 "only 'strict' and 'surrogateescape' error handlers "
3226 "are supported, not '%s'",
3227 errors);
3228 return -1;
3229}
3230
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003231PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003232PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003233{
3234 Py_ssize_t wlen, wlen2;
3235 wchar_t *wstr;
3236 PyObject *bytes = NULL;
3237 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003238 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003239 PyObject *exc;
3240 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003241 int surrogateescape;
3242
3243 if (locale_error_handler(errors, &surrogateescape) < 0)
3244 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003245
3246 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3247 if (wstr == NULL)
3248 return NULL;
3249
3250 wlen2 = wcslen(wstr);
3251 if (wlen2 != wlen) {
3252 PyMem_Free(wstr);
3253 PyErr_SetString(PyExc_TypeError, "embedded null character");
3254 return NULL;
3255 }
3256
3257 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003258 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003259 char *str;
3260
3261 str = _Py_wchar2char(wstr, &error_pos);
3262 if (str == NULL) {
3263 if (error_pos == (size_t)-1) {
3264 PyErr_NoMemory();
3265 PyMem_Free(wstr);
3266 return NULL;
3267 }
3268 else {
3269 goto encode_error;
3270 }
3271 }
3272 PyMem_Free(wstr);
3273
3274 bytes = PyBytes_FromString(str);
3275 PyMem_Free(str);
3276 }
3277 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003278 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003279 size_t len, len2;
3280
3281 len = wcstombs(NULL, wstr, 0);
3282 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003283 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003284 goto encode_error;
3285 }
3286
3287 bytes = PyBytes_FromStringAndSize(NULL, len);
3288 if (bytes == NULL) {
3289 PyMem_Free(wstr);
3290 return NULL;
3291 }
3292
3293 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3294 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003295 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003296 goto encode_error;
3297 }
3298 PyMem_Free(wstr);
3299 }
3300 return bytes;
3301
3302encode_error:
3303 errmsg = strerror(errno);
3304 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003305
3306 if (error_pos == (size_t)-1)
3307 error_pos = wcstombs_errorpos(wstr);
3308
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003309 PyMem_Free(wstr);
3310 Py_XDECREF(bytes);
3311
Victor Stinner2f197072011-12-17 07:08:30 +01003312 if (errmsg != NULL) {
3313 size_t errlen;
3314 wstr = _Py_char2wchar(errmsg, &errlen);
3315 if (wstr != NULL) {
3316 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003317 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003318 } else
3319 errmsg = NULL;
3320 }
3321 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003322 reason = PyUnicode_FromString(
3323 "wcstombs() encountered an unencodable "
3324 "wide character");
3325 if (reason == NULL)
3326 return NULL;
3327
3328 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3329 "locale", unicode,
3330 (Py_ssize_t)error_pos,
3331 (Py_ssize_t)(error_pos+1),
3332 reason);
3333 Py_DECREF(reason);
3334 if (exc != NULL) {
3335 PyCodec_StrictErrors(exc);
3336 Py_XDECREF(exc);
3337 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003338 return NULL;
3339}
3340
Victor Stinnerad158722010-10-27 00:25:46 +00003341PyObject *
3342PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003343{
Victor Stinner99b95382011-07-04 14:23:54 +02003344#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003345 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003346#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003347 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003348#else
Victor Stinner793b5312011-04-27 00:24:21 +02003349 PyInterpreterState *interp = PyThreadState_GET()->interp;
3350 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3351 cannot use it to encode and decode filenames before it is loaded. Load
3352 the Python codec requires to encode at least its own filename. Use the C
3353 version of the locale codec until the codec registry is initialized and
3354 the Python codec is loaded.
3355
3356 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3357 cannot only rely on it: check also interp->fscodec_initialized for
3358 subinterpreters. */
3359 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003360 return PyUnicode_AsEncodedString(unicode,
3361 Py_FileSystemDefaultEncoding,
3362 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003363 }
3364 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003365 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003366 }
Victor Stinnerad158722010-10-27 00:25:46 +00003367#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003368}
3369
Alexander Belopolsky40018472011-02-26 01:02:56 +00003370PyObject *
3371PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003372 const char *encoding,
3373 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374{
3375 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003376 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003377
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 if (!PyUnicode_Check(unicode)) {
3379 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381 }
Fred Drakee4315f52000-05-09 19:53:39 +00003382
Fred Drakee4315f52000-05-09 19:53:39 +00003383 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003384 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003385 if ((strcmp(lower, "utf-8") == 0) ||
3386 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003387 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003388 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003389 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003390 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003391 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003392 }
Victor Stinner37296e82010-06-10 13:36:23 +00003393 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003394 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003395 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003396 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003397#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003398 else if (strcmp(lower, "mbcs") == 0)
3399 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003400#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003401 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003402 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003403 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404
3405 /* Encode via the codec registry */
3406 v = PyCodec_Encode(unicode, encoding, errors);
3407 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003408 return NULL;
3409
3410 /* The normal path */
3411 if (PyBytes_Check(v))
3412 return v;
3413
3414 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003415 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003416 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003417 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003418
3419 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3420 "encoder %s returned bytearray instead of bytes",
3421 encoding);
3422 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003423 Py_DECREF(v);
3424 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003425 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003426
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003427 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3428 Py_DECREF(v);
3429 return b;
3430 }
3431
3432 PyErr_Format(PyExc_TypeError,
3433 "encoder did not return a bytes object (type=%.400s)",
3434 Py_TYPE(v)->tp_name);
3435 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003436 return NULL;
3437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
3440PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003443{
3444 PyObject *v;
3445
3446 if (!PyUnicode_Check(unicode)) {
3447 PyErr_BadArgument();
3448 goto onError;
3449 }
3450
3451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003453
3454 /* Encode via the codec registry */
3455 v = PyCodec_Encode(unicode, encoding, errors);
3456 if (v == NULL)
3457 goto onError;
3458 if (!PyUnicode_Check(v)) {
3459 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003460 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003461 Py_TYPE(v)->tp_name);
3462 Py_DECREF(v);
3463 goto onError;
3464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003466
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 return NULL;
3469}
3470
Victor Stinner2f197072011-12-17 07:08:30 +01003471static size_t
3472mbstowcs_errorpos(const char *str, size_t len)
3473{
3474#ifdef HAVE_MBRTOWC
3475 const char *start = str;
3476 mbstate_t mbs;
3477 size_t converted;
3478 wchar_t ch;
3479
3480 memset(&mbs, 0, sizeof mbs);
3481 while (len)
3482 {
3483 converted = mbrtowc(&ch, (char*)str, len, &mbs);
3484 if (converted == 0)
3485 /* Reached end of string */
3486 break;
3487 if (converted == (size_t)-1 || converted == (size_t)-2) {
3488 /* Conversion error or incomplete character */
3489 return str - start;
3490 }
3491 else {
3492 str += converted;
3493 len -= converted;
3494 }
3495 }
3496 /* failed to find the undecodable byte sequence */
3497 return 0;
3498#endif
3499 return 0;
3500}
3501
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003502PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003503PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003504 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003505{
3506 wchar_t smallbuf[256];
3507 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3508 wchar_t *wstr;
3509 size_t wlen, wlen2;
3510 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003511 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003512 size_t error_pos;
3513 char *errmsg;
3514 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003515
3516 if (locale_error_handler(errors, &surrogateescape) < 0)
3517 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003518
3519 if (str[len] != '\0' || len != strlen(str)) {
3520 PyErr_SetString(PyExc_TypeError, "embedded null character");
3521 return NULL;
3522 }
3523
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003524 if (surrogateescape) {
3525 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003526 wstr = _Py_char2wchar(str, &wlen);
3527 if (wstr == NULL) {
3528 if (wlen == (size_t)-1)
3529 PyErr_NoMemory();
3530 else
3531 PyErr_SetFromErrno(PyExc_OSError);
3532 return NULL;
3533 }
3534
3535 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003536 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003537 }
3538 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003539 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003540#ifndef HAVE_BROKEN_MBSTOWCS
3541 wlen = mbstowcs(NULL, str, 0);
3542#else
3543 wlen = len;
3544#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003545 if (wlen == (size_t)-1)
3546 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547 if (wlen+1 <= smallbuf_len) {
3548 wstr = smallbuf;
3549 }
3550 else {
3551 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3552 return PyErr_NoMemory();
3553
3554 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3555 if (!wstr)
3556 return PyErr_NoMemory();
3557 }
3558
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003559 wlen2 = mbstowcs(wstr, str, wlen+1);
3560 if (wlen2 == (size_t)-1) {
3561 if (wstr != smallbuf)
3562 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003563 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003564 }
3565#ifdef HAVE_BROKEN_MBSTOWCS
3566 assert(wlen2 == wlen);
3567#endif
3568 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3569 if (wstr != smallbuf)
3570 PyMem_Free(wstr);
3571 }
3572 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003573
3574decode_error:
3575 errmsg = strerror(errno);
3576 assert(errmsg != NULL);
3577
3578 error_pos = mbstowcs_errorpos(str, len);
3579 if (errmsg != NULL) {
3580 size_t errlen;
3581 wstr = _Py_char2wchar(errmsg, &errlen);
3582 if (wstr != NULL) {
3583 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003584 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003585 } else
3586 errmsg = NULL;
3587 }
3588 if (errmsg == NULL)
3589 reason = PyUnicode_FromString(
3590 "mbstowcs() encountered an invalid multibyte sequence");
3591 if (reason == NULL)
3592 return NULL;
3593
3594 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3595 "locale", str, len,
3596 (Py_ssize_t)error_pos,
3597 (Py_ssize_t)(error_pos+1),
3598 reason);
3599 Py_DECREF(reason);
3600 if (exc != NULL) {
3601 PyCodec_StrictErrors(exc);
3602 Py_XDECREF(exc);
3603 }
3604 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003605}
3606
3607PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003608PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003609{
3610 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003611 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003612}
3613
3614
3615PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003616PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003617 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003618 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3619}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003620
Christian Heimes5894ba72007-11-04 11:43:14 +00003621PyObject*
3622PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3623{
Victor Stinner99b95382011-07-04 14:23:54 +02003624#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003625 return PyUnicode_DecodeMBCS(s, size, NULL);
3626#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003627 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003628#else
Victor Stinner793b5312011-04-27 00:24:21 +02003629 PyInterpreterState *interp = PyThreadState_GET()->interp;
3630 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3631 cannot use it to encode and decode filenames before it is loaded. Load
3632 the Python codec requires to encode at least its own filename. Use the C
3633 version of the locale codec until the codec registry is initialized and
3634 the Python codec is loaded.
3635
3636 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3637 cannot only rely on it: check also interp->fscodec_initialized for
3638 subinterpreters. */
3639 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003640 return PyUnicode_Decode(s, size,
3641 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003642 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003643 }
3644 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003645 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003646 }
Victor Stinnerad158722010-10-27 00:25:46 +00003647#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003648}
3649
Martin v. Löwis011e8422009-05-05 04:43:17 +00003650
3651int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003652_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003653{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003654 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003655
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003656 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003657 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003658 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3659 PyUnicode_GET_LENGTH(str), '\0', 1);
3660 if (pos == -1)
3661 return 0;
3662 else
3663 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003664}
3665
Antoine Pitrou13348842012-01-29 18:36:34 +01003666int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003667PyUnicode_FSConverter(PyObject* arg, void* addr)
3668{
3669 PyObject *output = NULL;
3670 Py_ssize_t size;
3671 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003672 if (arg == NULL) {
3673 Py_DECREF(*(PyObject**)addr);
3674 return 1;
3675 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003676 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003677 output = arg;
3678 Py_INCREF(output);
3679 }
3680 else {
3681 arg = PyUnicode_FromObject(arg);
3682 if (!arg)
3683 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003684 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003685 Py_DECREF(arg);
3686 if (!output)
3687 return 0;
3688 if (!PyBytes_Check(output)) {
3689 Py_DECREF(output);
3690 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3691 return 0;
3692 }
3693 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003694 size = PyBytes_GET_SIZE(output);
3695 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003696 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003697 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003698 Py_DECREF(output);
3699 return 0;
3700 }
3701 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003702 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003703}
3704
3705
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003706int
3707PyUnicode_FSDecoder(PyObject* arg, void* addr)
3708{
3709 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003710 if (arg == NULL) {
3711 Py_DECREF(*(PyObject**)addr);
3712 return 1;
3713 }
3714 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003715 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003717 output = arg;
3718 Py_INCREF(output);
3719 }
3720 else {
3721 arg = PyBytes_FromObject(arg);
3722 if (!arg)
3723 return 0;
3724 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3725 PyBytes_GET_SIZE(arg));
3726 Py_DECREF(arg);
3727 if (!output)
3728 return 0;
3729 if (!PyUnicode_Check(output)) {
3730 Py_DECREF(output);
3731 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3732 return 0;
3733 }
3734 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003735 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003736 Py_DECREF(output);
3737 return 0;
3738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003739 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003740 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003741 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3742 Py_DECREF(output);
3743 return 0;
3744 }
3745 *(PyObject**)addr = output;
3746 return Py_CLEANUP_SUPPORTED;
3747}
3748
3749
Martin v. Löwis5b222132007-06-10 09:51:05 +00003750char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003752{
Christian Heimesf3863112007-11-22 07:46:41 +00003753 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003754
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003755 if (!PyUnicode_Check(unicode)) {
3756 PyErr_BadArgument();
3757 return NULL;
3758 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003760 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003762 if (PyUnicode_UTF8(unicode) == NULL) {
3763 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3765 if (bytes == NULL)
3766 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003767 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3768 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 Py_DECREF(bytes);
3770 return NULL;
3771 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3773 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3774 PyBytes_AS_STRING(bytes),
3775 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 Py_DECREF(bytes);
3777 }
3778
3779 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003780 *psize = PyUnicode_UTF8_LENGTH(unicode);
3781 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003782}
3783
3784char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3788}
3789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790Py_UNICODE *
3791PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 const unsigned char *one_byte;
3794#if SIZEOF_WCHAR_T == 4
3795 const Py_UCS2 *two_bytes;
3796#else
3797 const Py_UCS4 *four_bytes;
3798 const Py_UCS4 *ucs4_end;
3799 Py_ssize_t num_surrogates;
3800#endif
3801 wchar_t *w;
3802 wchar_t *wchar_end;
3803
3804 if (!PyUnicode_Check(unicode)) {
3805 PyErr_BadArgument();
3806 return NULL;
3807 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003808 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003810 assert(_PyUnicode_KIND(unicode) != 0);
3811 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3816 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817 num_surrogates = 0;
3818
3819 for (; four_bytes < ucs4_end; ++four_bytes) {
3820 if (*four_bytes > 0xFFFF)
3821 ++num_surrogates;
3822 }
3823
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003824 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3825 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3826 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 PyErr_NoMemory();
3828 return NULL;
3829 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003830 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 w = _PyUnicode_WSTR(unicode);
3833 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3834 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3836 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003837 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003839 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3840 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 }
3842 else
3843 *w = *four_bytes;
3844
3845 if (w > wchar_end) {
3846 assert(0 && "Miscalculated string end");
3847 }
3848 }
3849 *w = 0;
3850#else
3851 /* sizeof(wchar_t) == 4 */
3852 Py_FatalError("Impossible unicode object state, wstr and str "
3853 "should share memory already.");
3854 return NULL;
3855#endif
3856 }
3857 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003858 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3859 (_PyUnicode_LENGTH(unicode) + 1));
3860 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 PyErr_NoMemory();
3862 return NULL;
3863 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003864 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3865 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3866 w = _PyUnicode_WSTR(unicode);
3867 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003869 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3870 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 for (; w < wchar_end; ++one_byte, ++w)
3872 *w = *one_byte;
3873 /* null-terminate the wstr */
3874 *w = 0;
3875 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003876 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 for (; w < wchar_end; ++two_bytes, ++w)
3880 *w = *two_bytes;
3881 /* null-terminate the wstr */
3882 *w = 0;
3883#else
3884 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003885 PyObject_FREE(_PyUnicode_WSTR(unicode));
3886 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 Py_FatalError("Impossible unicode object state, wstr "
3888 "and str should share memory already.");
3889 return NULL;
3890#endif
3891 }
3892 else {
3893 assert(0 && "This should never happen.");
3894 }
3895 }
3896 }
3897 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003898 *size = PyUnicode_WSTR_LENGTH(unicode);
3899 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003900}
3901
Alexander Belopolsky40018472011-02-26 01:02:56 +00003902Py_UNICODE *
3903PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906}
3907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908
Alexander Belopolsky40018472011-02-26 01:02:56 +00003909Py_ssize_t
3910PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911{
3912 if (!PyUnicode_Check(unicode)) {
3913 PyErr_BadArgument();
3914 goto onError;
3915 }
3916 return PyUnicode_GET_SIZE(unicode);
3917
Benjamin Peterson29060642009-01-31 22:14:21 +00003918 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 return -1;
3920}
3921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922Py_ssize_t
3923PyUnicode_GetLength(PyObject *unicode)
3924{
Victor Stinner07621332012-06-16 04:53:46 +02003925 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 PyErr_BadArgument();
3927 return -1;
3928 }
Victor Stinner07621332012-06-16 04:53:46 +02003929 if (PyUnicode_READY(unicode) == -1)
3930 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 return PyUnicode_GET_LENGTH(unicode);
3932}
3933
3934Py_UCS4
3935PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3936{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003937 void *data;
3938 int kind;
3939
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003940 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3941 PyErr_BadArgument();
3942 return (Py_UCS4)-1;
3943 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003944 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003945 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 return (Py_UCS4)-1;
3947 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003948 data = PyUnicode_DATA(unicode);
3949 kind = PyUnicode_KIND(unicode);
3950 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951}
3952
3953int
3954PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3955{
3956 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003957 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958 return -1;
3959 }
Victor Stinner488fa492011-12-12 00:01:39 +01003960 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003961 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003962 PyErr_SetString(PyExc_IndexError, "string index out of range");
3963 return -1;
3964 }
Victor Stinner488fa492011-12-12 00:01:39 +01003965 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003966 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003967 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3968 PyErr_SetString(PyExc_ValueError, "character out of range");
3969 return -1;
3970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3972 index, ch);
3973 return 0;
3974}
3975
/* Return the name of the default encoding, which is the fixed string
   "utf-8".  The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3981
Victor Stinner554f3f02010-06-16 23:33:54 +00003982/* create or adjust a UnicodeDecodeError */
3983static void
3984make_decode_exception(PyObject **exceptionObject,
3985 const char *encoding,
3986 const char *input, Py_ssize_t length,
3987 Py_ssize_t startpos, Py_ssize_t endpos,
3988 const char *reason)
3989{
3990 if (*exceptionObject == NULL) {
3991 *exceptionObject = PyUnicodeDecodeError_Create(
3992 encoding, input, length, startpos, endpos, reason);
3993 }
3994 else {
3995 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3996 goto onError;
3997 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3998 goto onError;
3999 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4000 goto onError;
4001 }
4002 return;
4003
4004onError:
4005 Py_DECREF(*exceptionObject);
4006 *exceptionObject = NULL;
4007}
4008
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors / errorHandler: name of the error handler and a cache for the
       looked-up callable (filled in on first use).
   encoding / reason: used to create or update *exceptionObject.
   input / inend: current input buffer bounds; rewritten from the exception's
       object attribute, which the callback may have replaced.
   startinpos / endinpos: error range passed to the exception; *endinpos is
       set to the position returned by the handler.
   inptr: set to the input position at which decoding should resume.
   output / outpos: wchar-kind output string and the write position in it;
       the output is resized when the replacement does not fit.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: going on would read a non-bytes object through the
           PyBytes_* macros while an exception is already pending. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* HAVE_MBCS */
4115
4116static int
4117unicode_decode_call_errorhandler_writer(
4118 const char *errors, PyObject **errorHandler,
4119 const char *encoding, const char *reason,
4120 const char **input, const char **inend, Py_ssize_t *startinpos,
4121 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4122 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4123{
4124 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4125
4126 PyObject *restuple = NULL;
4127 PyObject *repunicode = NULL;
4128 Py_ssize_t insize;
4129 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004130 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004131 PyObject *inputobj = NULL;
4132
4133 if (*errorHandler == NULL) {
4134 *errorHandler = PyCodec_LookupError(errors);
4135 if (*errorHandler == NULL)
4136 goto onError;
4137 }
4138
4139 make_decode_exception(exceptionObject,
4140 encoding,
4141 *input, *inend - *input,
4142 *startinpos, *endinpos,
4143 reason);
4144 if (*exceptionObject == NULL)
4145 goto onError;
4146
4147 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4148 if (restuple == NULL)
4149 goto onError;
4150 if (!PyTuple_Check(restuple)) {
4151 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4152 goto onError;
4153 }
4154 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004155 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004156
4157 /* Copy back the bytes variables, which might have been modified by the
4158 callback */
4159 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4160 if (!inputobj)
4161 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004162 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004164 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004165 *input = PyBytes_AS_STRING(inputobj);
4166 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004167 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004168 /* we can DECREF safely, as the exception has another reference,
4169 so the object won't go away. */
4170 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004171
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004174 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4176 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004177 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178
Victor Stinner8f674cc2013-04-17 23:02:17 +02004179 if (PyUnicode_READY(repunicode) < 0)
4180 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004181 replen = PyUnicode_GET_LENGTH(repunicode);
4182 writer->min_length += replen;
4183 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004184 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004185 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004186 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004189 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004192 Py_XDECREF(restuple);
4193 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004197 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198}
4199
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004200/* --- UTF-7 Codec -------------------------------------------------------- */
4201
Antoine Pitrou244651a2009-05-04 18:56:13 +00004202/* See RFC2152 for details. We encode conservatively and decode liberally. */
4203
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so pass side-effect-free expressions only. */

/* Is c a base-64 character ('A'-'Z', 'a'-'z', '0'-'9', '+' or '/')? */

#define IS_BASE64(c)                                    \
    ((c) == '+' || (c) == '/' ||                        \
     ((c) >= '0' && (c) <= '9') ||                      \
     ((c) >= 'a' && (c) <= 'z') ||                      \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   ('A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else -> 63) */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                       \
     "abcdefghijklmnopqrstuvwxyz"                       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4234
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0-127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be side-effect free (it is evaluated several times).  The
 * range test comes first so the table is never indexed out of bounds.
 * directO and directWS are parenthesized so that compound expressions
 * (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004280
Alexander Belopolsky40018472011-02-26 01:02:56 +00004281PyObject *
4282PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004283 Py_ssize_t size,
4284 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004285{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004286 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4287}
4288
Antoine Pitrou244651a2009-05-04 18:56:13 +00004289/* The decoder. The only state we preserve is our read position,
4290 * i.e. how many characters we have consumed. So if we end in the
4291 * middle of a shift sequence we have to back off the read position
4292 * and the output to the beginning of the sequence, otherwise we lose
4293 * all the shift state (seen bits, number of bits seen, high
4294 * surrogate). */
4295
Alexander Belopolsky40018472011-02-26 01:02:56 +00004296PyObject *
4297PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004298 Py_ssize_t size,
4299 const char *errors,
4300 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004301{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004303 Py_ssize_t startinpos;
4304 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004305 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307 const char *errmsg = "";
4308 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004309 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310 unsigned int base64bits = 0;
4311 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004312 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313 PyObject *errorHandler = NULL;
4314 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004315
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004316 if (size == 0) {
4317 if (consumed)
4318 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004319 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004320 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004322 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004323 _PyUnicodeWriter_Init(&writer);
4324 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004325
4326 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327 e = s + size;
4328
4329 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004330 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004332 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 if (inShift) { /* in a base-64 section */
4335 if (IS_BASE64(ch)) { /* consume a base-64 character */
4336 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4337 base64bits += 6;
4338 s++;
4339 if (base64bits >= 16) {
4340 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004341 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004342 base64bits -= 16;
4343 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4344 if (surrogate) {
4345 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004346 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4347 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004348 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004349 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004351 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 }
4353 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004354 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004355 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004356 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 }
4358 }
Victor Stinner551ac952011-11-29 22:58:13 +01004359 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 /* first surrogate */
4361 surrogate = outCh;
4362 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004364 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004365 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 }
4367 }
4368 }
4369 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370 inShift = 0;
4371 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004373 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004374 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004375 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004376 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 if (base64bits > 0) { /* left-over bits */
4378 if (base64bits >= 6) {
4379 /* We've seen at least one base-64 character */
4380 errmsg = "partial character in shift sequence";
4381 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004382 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 else {
4384 /* Some bits remain; they should be zero */
4385 if (base64buffer != 0) {
4386 errmsg = "non-zero padding bits in shift sequence";
4387 goto utf7Error;
4388 }
4389 }
4390 }
4391 if (ch != '-') {
4392 /* '-' is absorbed; other terminating
4393 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004394 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004395 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004397 }
4398 }
4399 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004401 s++; /* consume '+' */
4402 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004404 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004405 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 }
4407 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004409 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 }
4412 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004415 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 else {
4419 startinpos = s-starts;
4420 s++;
4421 errmsg = "unexpected special character";
4422 goto utf7Error;
4423 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004427 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 errors, &errorHandler,
4429 "utf7", errmsg,
4430 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004431 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 }
4434
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435 /* end of string */
4436
4437 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4438 /* if we're in an inconsistent state, that's an error */
4439 if (surrogate ||
4440 (base64bits >= 6) ||
4441 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 errors, &errorHandler,
4445 "utf7", "unterminated shift sequence",
4446 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004447 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 goto onError;
4449 if (s < e)
4450 goto restart;
4451 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453
4454 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004455 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004457 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 }
4460 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004461 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004463 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 Py_XDECREF(errorHandler);
4466 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004467 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004468
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 Py_XDECREF(errorHandler);
4471 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004472 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473 return NULL;
4474}
4475
4476
/* Encode the unicode object `str` as UTF-7 and return a new bytes object,
   or NULL with an exception set on failure.

   base64SetO and base64WhiteSpace are passed, negated, as the directO and
   directWS arguments of ENCODE_DIRECT: a non-zero value means characters of
   that class are NOT written directly and go through the base-64 shift
   encoding instead.  `errors` is accepted but not used by this function. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;               /* non-zero while inside a '+...' section */
    Py_ssize_t i;
    unsigned int base64bits = 0;   /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    char * out;                    /* write cursor into v's buffer */
    char * start;                  /* beginning of v's buffer */

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    /* Guard the len * 8 multiplication below against overflow. */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                /* stay in the shift sequence and base-64 encode ch */
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* open a shift sequence, then base-64 encode ch */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* Shared tail: push ch into the bit buffer as one or two 16-bit
           units and flush every complete 6-bit group. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* End of input: flush any partial 6-bit group (left-aligned), then close
       a still-open shift sequence with '-'. */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* Shrink the over-allocated buffer to the bytes actually written. */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004578PyObject *
4579PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4580 Py_ssize_t size,
4581 int base64SetO,
4582 int base64WhiteSpace,
4583 const char *errors)
4584{
4585 PyObject *result;
4586 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4587 if (tmp == NULL)
4588 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004589 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004590 base64WhiteSpace, errors);
4591 Py_DECREF(tmp);
4592 return result;
4593}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595#undef IS_BASE64
4596#undef FROM_BASE64
4597#undef TO_BASE64
4598#undef DECODE_DIRECT
4599#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004600
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601/* --- UTF-8 Codec -------------------------------------------------------- */
4602
Alexander Belopolsky40018472011-02-26 01:02:56 +00004603PyObject *
4604PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004605 Py_ssize_t size,
4606 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607{
Walter Dörwald69652032004-09-07 20:24:22 +00004608 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4609}
4610
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004611#include "stringlib/asciilib.h"
4612#include "stringlib/codecs.h"
4613#include "stringlib/undef.h"
4614
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004615#include "stringlib/ucs1lib.h"
4616#include "stringlib/codecs.h"
4617#include "stringlib/undef.h"
4618
4619#include "stringlib/ucs2lib.h"
4620#include "stringlib/codecs.h"
4621#include "stringlib/undef.h"
4622
4623#include "stringlib/ucs4lib.h"
4624#include "stringlib/codecs.h"
4625#include "stringlib/undef.h"
4626
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the high bit (0x80) of every
   byte set, so `value & ASCII_CHAR_MASK` is non-zero exactly when at least
   one byte of `value` is >= 0x80.  Only 4- and 8-byte longs are supported. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4636
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004637static Py_ssize_t
4638ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004639{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004641 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004642
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004643 /*
4644 * Issue #17237: m68k is a bit different from most architectures in
4645 * that objects do not use "natural alignment" - for example, int and
4646 * long are only aligned at 2-byte boundaries. Therefore the assert()
4647 * won't work; also, tests have shown that skipping the "optimised
4648 * version" will even speed up m68k.
4649 */
4650#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004652 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4653 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654 /* Fast path, see in STRINGLIB(utf8_decode) for
4655 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004656 /* Help allocation */
4657 const char *_p = p;
4658 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004659 while (_p < aligned_end) {
4660 unsigned long value = *(const unsigned long *) _p;
4661 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004663 *((unsigned long *)q) = value;
4664 _p += SIZEOF_LONG;
4665 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004666 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 p = _p;
4668 while (p < end) {
4669 if ((unsigned char)*p & 0x80)
4670 break;
4671 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004675#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004676#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 while (p < end) {
4678 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4679 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004680 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004681 /* Help allocation */
4682 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683 while (_p < aligned_end) {
4684 unsigned long value = *(unsigned long *) _p;
4685 if (value & ASCII_CHAR_MASK)
4686 break;
4687 _p += SIZEOF_LONG;
4688 }
4689 p = _p;
4690 if (_p == end)
4691 break;
4692 }
4693 if ((unsigned char)*p & 0x80)
4694 break;
4695 ++p;
4696 }
4697 memcpy(dest, start, p - start);
4698 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699}
Antoine Pitrouab868312009-01-10 15:40:25 +00004700
Victor Stinner785938e2011-12-11 20:09:03 +01004701PyObject *
4702PyUnicode_DecodeUTF8Stateful(const char *s,
4703 Py_ssize_t size,
4704 const char *errors,
4705 Py_ssize_t *consumed)
4706{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004707 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004708 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004709 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004710
4711 Py_ssize_t startinpos;
4712 Py_ssize_t endinpos;
4713 const char *errmsg = "";
4714 PyObject *errorHandler = NULL;
4715 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004716
4717 if (size == 0) {
4718 if (consumed)
4719 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004720 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004721 }
4722
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4724 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004725 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 *consumed = 1;
4727 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004728 }
4729
Victor Stinner8f674cc2013-04-17 23:02:17 +02004730 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004731 writer.min_length = size;
4732 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004733 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004734
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004735 writer.pos = ascii_decode(s, end, writer.data);
4736 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737 while (s < end) {
4738 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004739 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004740 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004741 if (PyUnicode_IS_ASCII(writer.buffer))
4742 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004743 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004744 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004747 } else {
4748 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004749 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004750 }
4751
4752 switch (ch) {
4753 case 0:
4754 if (s == end || consumed)
4755 goto End;
4756 errmsg = "unexpected end of data";
4757 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004758 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004759 break;
4760 case 1:
4761 errmsg = "invalid start byte";
4762 startinpos = s - starts;
4763 endinpos = startinpos + 1;
4764 break;
4765 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004766 case 3:
4767 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768 errmsg = "invalid continuation byte";
4769 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004770 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 break;
4772 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004773 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774 goto onError;
4775 continue;
4776 }
4777
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004778 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 errors, &errorHandler,
4780 "utf-8", errmsg,
4781 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004782 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004783 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004784 }
4785
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787 if (consumed)
4788 *consumed = s - starts;
4789
4790 Py_XDECREF(errorHandler);
4791 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004792 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004793
4794onError:
4795 Py_XDECREF(errorHandler);
4796 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004797 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004799}
4800
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded is mapped to the lone surrogate
   0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL if the required
   buffer size would overflow or on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* The decoder handed back a character it did not store. */
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* Only a non-BMP code point can be split into a surrogate
               pair; a value that is itself a surrogate cannot. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch is a small status value here: stop at a clean end of
               input, otherwise escape the offending byte and carry on. */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004857/* Primary internal function which creates utf8 encoded bytes objects.
4858
4859 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004860 and allocate exactly as much space needed at the end. Else allocate the
4861 maximum possible needed (4 result bytes per Unicode character), and return
4862 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004863*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004864PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004865_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866{
Victor Stinner6099a032011-12-18 14:22:26 +01004867 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868 void *data;
4869 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004871 if (!PyUnicode_Check(unicode)) {
4872 PyErr_BadArgument();
4873 return NULL;
4874 }
4875
4876 if (PyUnicode_READY(unicode) == -1)
4877 return NULL;
4878
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004879 if (PyUnicode_UTF8(unicode))
4880 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4881 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882
4883 kind = PyUnicode_KIND(unicode);
4884 data = PyUnicode_DATA(unicode);
4885 size = PyUnicode_GET_LENGTH(unicode);
4886
Benjamin Petersonead6b532011-12-20 17:23:42 -06004887 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004888 default:
4889 assert(0);
4890 case PyUnicode_1BYTE_KIND:
4891 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4892 assert(!PyUnicode_IS_ASCII(unicode));
4893 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4894 case PyUnicode_2BYTE_KIND:
4895 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4896 case PyUnicode_4BYTE_KIND:
4897 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004898 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899}
4900
Alexander Belopolsky40018472011-02-26 01:02:56 +00004901PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4903 Py_ssize_t size,
4904 const char *errors)
4905{
4906 PyObject *v, *unicode;
4907
4908 unicode = PyUnicode_FromUnicode(s, size);
4909 if (unicode == NULL)
4910 return NULL;
4911 v = _PyUnicode_AsUTF8String(unicode, errors);
4912 Py_DECREF(unicode);
4913 return v;
4914}
4915
4916PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004917PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004919 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920}
4921
Walter Dörwald41980ca2007-08-16 21:55:45 +00004922/* --- UTF-32 Codec ------------------------------------------------------- */
4923
4924PyObject *
4925PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 Py_ssize_t size,
4927 const char *errors,
4928 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929{
4930 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4931}
4932
4933PyObject *
4934PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 Py_ssize_t size,
4936 const char *errors,
4937 int *byteorder,
4938 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939{
4940 const char *starts = s;
4941 Py_ssize_t startinpos;
4942 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004943 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004944 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004945 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004946 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947 PyObject *errorHandler = NULL;
4948 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004949
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950 q = (unsigned char *)s;
4951 e = q + size;
4952
4953 if (byteorder)
4954 bo = *byteorder;
4955
4956 /* Check for BOM marks (U+FEFF) in the input and adjust current
4957 byte order setting accordingly. In native mode, the leading BOM
4958 mark is skipped, in all other modes, it is copied to the output
4959 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004960 if (bo == 0 && size >= 4) {
4961 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4962 if (bom == 0x0000FEFF) {
4963 bo = -1;
4964 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004966 else if (bom == 0xFFFE0000) {
4967 bo = 1;
4968 q += 4;
4969 }
4970 if (byteorder)
4971 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972 }
4973
Victor Stinnere64322e2012-10-30 23:12:47 +01004974 if (q == e) {
4975 if (consumed)
4976 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004977 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004978 }
4979
Victor Stinnere64322e2012-10-30 23:12:47 +01004980#ifdef WORDS_BIGENDIAN
4981 le = bo < 0;
4982#else
4983 le = bo <= 0;
4984#endif
4985
Victor Stinner8f674cc2013-04-17 23:02:17 +02004986 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004987 writer.min_length = (e - q + 3) / 4;
4988 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004989 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004990
Victor Stinnere64322e2012-10-30 23:12:47 +01004991 while (1) {
4992 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004994
Victor Stinnere64322e2012-10-30 23:12:47 +01004995 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 enum PyUnicode_Kind kind = writer.kind;
4997 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004998 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005000 if (le) {
5001 do {
5002 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5003 if (ch > maxch)
5004 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005005 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005006 q += 4;
5007 } while (q <= last);
5008 }
5009 else {
5010 do {
5011 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5012 if (ch > maxch)
5013 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005014 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005015 q += 4;
5016 } while (q <= last);
5017 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005018 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005019 }
5020
5021 if (ch <= maxch) {
5022 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005024 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005026 startinpos = ((const char *)q) - starts;
5027 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005029 else {
5030 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005031 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005032 goto onError;
5033 q += 4;
5034 continue;
5035 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005037 startinpos = ((const char *)q) - starts;
5038 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005040
5041 /* The remaining input chars are ignored if the callback
5042 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005043 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 errors, &errorHandler,
5045 "utf32", errmsg,
5046 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005047 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005049 }
5050
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053
Walter Dörwald41980ca2007-08-16 21:55:45 +00005054 Py_XDECREF(errorHandler);
5055 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005056 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005059 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005060 Py_XDECREF(errorHandler);
5061 Py_XDECREF(exc);
5062 return NULL;
5063}
5064
5065PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005066_PyUnicode_EncodeUTF32(PyObject *str,
5067 const char *errors,
5068 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005070 int kind;
5071 void *data;
5072 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005073 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005074 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005075 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005077#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 int iorder[] = {0, 1, 2, 3};
5079#else
5080 int iorder[] = {3, 2, 1, 0};
5081#endif
5082
Benjamin Peterson29060642009-01-31 22:14:21 +00005083#define STORECHAR(CH) \
5084 do { \
5085 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5086 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5087 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5088 p[iorder[0]] = (CH) & 0xff; \
5089 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090 } while(0)
5091
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005092 if (!PyUnicode_Check(str)) {
5093 PyErr_BadArgument();
5094 return NULL;
5095 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005096 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005097 return NULL;
5098 kind = PyUnicode_KIND(str);
5099 data = PyUnicode_DATA(str);
5100 len = PyUnicode_GET_LENGTH(str);
5101
5102 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005103 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005105 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106 if (v == NULL)
5107 return NULL;
5108
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005109 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005112 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005113 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114
5115 if (byteorder == -1) {
5116 /* force LE */
5117 iorder[0] = 0;
5118 iorder[1] = 1;
5119 iorder[2] = 2;
5120 iorder[3] = 3;
5121 }
5122 else if (byteorder == 1) {
5123 /* force BE */
5124 iorder[0] = 3;
5125 iorder[1] = 2;
5126 iorder[2] = 1;
5127 iorder[3] = 0;
5128 }
5129
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005130 for (i = 0; i < len; i++)
5131 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005132
5133 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005134 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005135#undef STORECHAR
5136}
5137
Alexander Belopolsky40018472011-02-26 01:02:56 +00005138PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005139PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5140 Py_ssize_t size,
5141 const char *errors,
5142 int byteorder)
5143{
5144 PyObject *result;
5145 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5146 if (tmp == NULL)
5147 return NULL;
5148 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5149 Py_DECREF(tmp);
5150 return result;
5151}
5152
5153PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005154PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005155{
Victor Stinnerb960b342011-11-20 19:12:52 +01005156 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005157}
5158
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159/* --- UTF-16 Codec ------------------------------------------------------- */
5160
Tim Peters772747b2001-08-09 22:21:55 +00005161PyObject *
5162PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 Py_ssize_t size,
5164 const char *errors,
5165 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166{
Walter Dörwald69652032004-09-07 20:24:22 +00005167 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5168}
5169
5170PyObject *
5171PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 Py_ssize_t size,
5173 const char *errors,
5174 int *byteorder,
5175 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005176{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005178 Py_ssize_t startinpos;
5179 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005180 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005181 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005182 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005183 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005184 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 PyObject *errorHandler = NULL;
5186 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187
Tim Peters772747b2001-08-09 22:21:55 +00005188 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005189 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190
5191 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005192 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005194 /* Check for BOM marks (U+FEFF) in the input and adjust current
5195 byte order setting accordingly. In native mode, the leading BOM
5196 mark is skipped, in all other modes, it is copied to the output
5197 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005198 if (bo == 0 && size >= 2) {
5199 const Py_UCS4 bom = (q[1] << 8) | q[0];
5200 if (bom == 0xFEFF) {
5201 q += 2;
5202 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005204 else if (bom == 0xFFFE) {
5205 q += 2;
5206 bo = 1;
5207 }
5208 if (byteorder)
5209 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211
Antoine Pitrou63065d72012-05-15 23:48:04 +02005212 if (q == e) {
5213 if (consumed)
5214 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005215 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005216 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005217
Christian Heimes743e0cd2012-10-17 23:52:17 +02005218#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005219 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005220#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005221 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005222#endif
Tim Peters772747b2001-08-09 22:21:55 +00005223
Antoine Pitrou63065d72012-05-15 23:48:04 +02005224 /* Note: size will always be longer than the resulting Unicode
5225 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005226 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005227 writer.min_length = (e - q + 1) / 2;
5228 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005229 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005230
Antoine Pitrou63065d72012-05-15 23:48:04 +02005231 while (1) {
5232 Py_UCS4 ch = 0;
5233 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005234 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005235 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005236 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005237 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005238 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005239 native_ordering);
5240 else
5241 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005242 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005243 native_ordering);
5244 } else if (kind == PyUnicode_2BYTE_KIND) {
5245 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005246 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005247 native_ordering);
5248 } else {
5249 assert(kind == PyUnicode_4BYTE_KIND);
5250 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005251 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005252 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005253 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005254 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005255
Antoine Pitrou63065d72012-05-15 23:48:04 +02005256 switch (ch)
5257 {
5258 case 0:
5259 /* remaining byte at the end? (size should be even) */
5260 if (q == e || consumed)
5261 goto End;
5262 errmsg = "truncated data";
5263 startinpos = ((const char *)q) - starts;
5264 endinpos = ((const char *)e) - starts;
5265 break;
5266 /* The remaining input chars are ignored if the callback
5267 chooses to skip the input */
5268 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005269 q -= 2;
5270 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005271 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005272 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005273 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005274 endinpos = ((const char *)e) - starts;
5275 break;
5276 case 2:
5277 errmsg = "illegal encoding";
5278 startinpos = ((const char *)q) - 2 - starts;
5279 endinpos = startinpos + 2;
5280 break;
5281 case 3:
5282 errmsg = "illegal UTF-16 surrogate";
5283 startinpos = ((const char *)q) - 4 - starts;
5284 endinpos = startinpos + 2;
5285 break;
5286 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005287 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005288 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 continue;
5290 }
5291
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005292 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005293 errors,
5294 &errorHandler,
5295 "utf16", errmsg,
5296 &starts,
5297 (const char **)&e,
5298 &startinpos,
5299 &endinpos,
5300 &exc,
5301 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 }
5305
Antoine Pitrou63065d72012-05-15 23:48:04 +02005306End:
Walter Dörwald69652032004-09-07 20:24:22 +00005307 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005309
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005310 Py_XDECREF(errorHandler);
5311 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005315 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005316 Py_XDECREF(errorHandler);
5317 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 return NULL;
5319}
5320
Tim Peters772747b2001-08-09 22:21:55 +00005321PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005322_PyUnicode_EncodeUTF16(PyObject *str,
5323 const char *errors,
5324 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005326 enum PyUnicode_Kind kind;
5327 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005328 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005329 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005330 unsigned short *out;
5331 Py_ssize_t bytesize;
5332 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005333#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005334 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005335#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005336 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005337#endif
5338
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005339 if (!PyUnicode_Check(str)) {
5340 PyErr_BadArgument();
5341 return NULL;
5342 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005343 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005344 return NULL;
5345 kind = PyUnicode_KIND(str);
5346 data = PyUnicode_DATA(str);
5347 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005348
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005349 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005350 if (kind == PyUnicode_4BYTE_KIND) {
5351 const Py_UCS4 *in = (const Py_UCS4 *)data;
5352 const Py_UCS4 *end = in + len;
5353 while (in < end)
5354 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005355 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005356 }
5357 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005359 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005360 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 if (v == NULL)
5362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005364 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005365 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005366 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005368 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005369 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005370 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005371
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005372 switch (kind) {
5373 case PyUnicode_1BYTE_KIND: {
5374 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5375 break;
Tim Peters772747b2001-08-09 22:21:55 +00005376 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005377 case PyUnicode_2BYTE_KIND: {
5378 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5379 break;
Tim Peters772747b2001-08-09 22:21:55 +00005380 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005381 case PyUnicode_4BYTE_KIND: {
5382 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5383 break;
5384 }
5385 default:
5386 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005387 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005388
5389 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005390 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391}
5392
Alexander Belopolsky40018472011-02-26 01:02:56 +00005393PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005394PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5395 Py_ssize_t size,
5396 const char *errors,
5397 int byteorder)
5398{
5399 PyObject *result;
5400 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5401 if (tmp == NULL)
5402 return NULL;
5403 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5404 Py_DECREF(tmp);
5405 return result;
5406}
5407
5408PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005409PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005411 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412}
5413
5414/* --- Unicode Escape Codec ----------------------------------------------- */
5415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005416/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5417 if all the escapes in the string make it still a valid ASCII string.
5418 Returns -1 if any escapes were found which cause the string to
5419 pop out of ASCII range. Otherwise returns the length of the
5420 required buffer to hold the string.
5421 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005422static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005423length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5424{
5425 const unsigned char *p = (const unsigned char *)s;
5426 const unsigned char *end = p + size;
5427 Py_ssize_t length = 0;
5428
5429 if (size < 0)
5430 return -1;
5431
5432 for (; p < end; ++p) {
5433 if (*p > 127) {
5434 /* Non-ASCII */
5435 return -1;
5436 }
5437 else if (*p != '\\') {
5438 /* Normal character */
5439 ++length;
5440 }
5441 else {
5442 /* Backslash-escape, check next char */
5443 ++p;
5444 /* Escape sequence reaches till end of string or
5445 non-ASCII follow-up. */
5446 if (p >= end || *p > 127)
5447 return -1;
5448 switch (*p) {
5449 case '\n':
5450 /* backslash + \n result in zero characters */
5451 break;
5452 case '\\': case '\'': case '\"':
5453 case 'b': case 'f': case 't':
5454 case 'n': case 'r': case 'v': case 'a':
5455 ++length;
5456 break;
5457 case '0': case '1': case '2': case '3':
5458 case '4': case '5': case '6': case '7':
5459 case 'x': case 'u': case 'U': case 'N':
5460 /* these do not guarantee ASCII characters */
5461 return -1;
5462 default:
5463 /* count the backslash + the other character */
5464 length += 2;
5465 }
5466 }
5467 }
5468 return length;
5469}
5470
Fredrik Lundh06d12682001-01-24 07:59:11 +00005471static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005472
Alexander Belopolsky40018472011-02-26 01:02:56 +00005473PyObject *
5474PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005475 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005476 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005479 Py_ssize_t startinpos;
5480 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005481 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005483 char* message;
5484 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005485 PyObject *errorHandler = NULL;
5486 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005487 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005488
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005489 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005490 if (len == 0)
5491 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005492
5493 /* After length_of_escaped_ascii_string() there are two alternatives,
5494 either the string is pure ASCII with named escapes like \n, etc.
5495 and we determined it's exact size (common case)
5496 or it contains \x, \u, ... escape sequences. then we create a
5497 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005498 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005499 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005500 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005501 }
5502 else {
5503 /* Escaped strings will always be longer than the resulting
5504 Unicode string, so we start with size here and then reduce the
5505 length after conversion to the true value.
5506 (but if the error callback returns a long replacement string
5507 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005508 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509 }
5510
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005512 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005514
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 while (s < end) {
5516 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005517 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519
5520 /* Non-escape characters are interpreted as Unicode ordinals */
5521 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005522 x = (unsigned char)*s;
5523 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005524 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005525 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 continue;
5527 }
5528
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005529 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 /* \ - Escapes */
5531 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005532 c = *s++;
5533 if (s > end)
5534 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005535
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005536 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005539#define WRITECHAR(ch) \
5540 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005541 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005542 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005543 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005544
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005546 case '\\': WRITECHAR('\\'); break;
5547 case '\'': WRITECHAR('\''); break;
5548 case '\"': WRITECHAR('\"'); break;
5549 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005551 case 'f': WRITECHAR('\014'); break;
5552 case 't': WRITECHAR('\t'); break;
5553 case 'n': WRITECHAR('\n'); break;
5554 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005555 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005556 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005558 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 case '0': case '1': case '2': case '3':
5562 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005563 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005564 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005565 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005566 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005567 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005569 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 break;
5571
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 /* hex escapes */
5573 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005575 digits = 2;
5576 message = "truncated \\xXX escape";
5577 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005581 digits = 4;
5582 message = "truncated \\uXXXX escape";
5583 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005586 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005587 digits = 8;
5588 message = "truncated \\UXXXXXXXX escape";
5589 hexescape:
5590 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005591 if (end - s < digits) {
5592 /* count only hex digits */
5593 for (; s < end; ++s) {
5594 c = (unsigned char)*s;
5595 if (!Py_ISXDIGIT(c))
5596 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005597 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005598 goto error;
5599 }
5600 for (; digits--; ++s) {
5601 c = (unsigned char)*s;
5602 if (!Py_ISXDIGIT(c))
5603 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005604 chr = (chr<<4) & ~0xF;
5605 if (c >= '0' && c <= '9')
5606 chr += c - '0';
5607 else if (c >= 'a' && c <= 'f')
5608 chr += 10 + c - 'a';
5609 else
5610 chr += 10 + c - 'A';
5611 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005612 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613 /* _decoding_error will have already written into the
5614 target buffer. */
5615 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005616 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005617 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005618 message = "illegal Unicode character";
5619 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005620 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005621 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005622 break;
5623
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005625 case 'N':
5626 message = "malformed \\N character escape";
5627 if (ucnhash_CAPI == NULL) {
5628 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005629 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5630 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005631 if (ucnhash_CAPI == NULL)
5632 goto ucnhashError;
5633 }
5634 if (*s == '{') {
5635 const char *start = s+1;
5636 /* look for the closing brace */
5637 while (*s != '}' && s < end)
5638 s++;
5639 if (s > start && s < end && *s == '}') {
5640 /* found a name. look it up in the unicode database */
5641 message = "unknown Unicode character name";
5642 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005643 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005644 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005645 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005646 goto store;
5647 }
5648 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005649 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005650
5651 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005652 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 message = "\\ at end of string";
5654 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005655 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005656 }
5657 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005658 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005659 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005660 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005661 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005663 continue;
5664
5665 error:
5666 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005667 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005668 errors, &errorHandler,
5669 "unicodeescape", message,
5670 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005671 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005672 goto onError;
5673 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005676
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005677 Py_XDECREF(errorHandler);
5678 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005679 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005680
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005682 PyErr_SetString(
5683 PyExc_UnicodeError,
5684 "\\N escapes not supported (can't load unicodedata module)"
5685 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005686 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687 Py_XDECREF(errorHandler);
5688 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005689 return NULL;
5690
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005692 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 Py_XDECREF(errorHandler);
5694 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 return NULL;
5696}
5697
5698/* Return a Unicode-Escape string version of the Unicode object.
5699
5700 If quotes is true, the string is enclosed in u"" or u'' quotes as
5701 appropriate.
5702
5703*/
5704
Alexander Belopolsky40018472011-02-26 01:02:56 +00005705PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005706PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005708 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005709 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005711 int kind;
5712 void *data;
5713 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714
Ezio Melottie7f90372012-10-05 03:33:31 +03005715 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005716 escape.
5717
Ezio Melottie7f90372012-10-05 03:33:31 +03005718 For UCS1 strings it's '\xxx', 4 bytes per source character.
5719 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5720 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005721 */
5722
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005723 if (!PyUnicode_Check(unicode)) {
5724 PyErr_BadArgument();
5725 return NULL;
5726 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005727 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005728 return NULL;
5729 len = PyUnicode_GET_LENGTH(unicode);
5730 kind = PyUnicode_KIND(unicode);
5731 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005732 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005733 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5734 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5735 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5736 }
5737
5738 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005739 return PyBytes_FromStringAndSize(NULL, 0);
5740
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005741 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005743
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005744 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005746 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 if (repr == NULL)
5749 return NULL;
5750
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005751 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005753 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005754 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005755
Walter Dörwald79e913e2007-05-12 11:08:06 +00005756 /* Escape backslashes */
5757 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 *p++ = '\\';
5759 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005760 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005761 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005762
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005763 /* Map 21-bit characters to '\U00xxxxxx' */
5764 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005765 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005766 *p++ = '\\';
5767 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005768 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5769 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5770 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5771 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5772 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5773 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5774 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5775 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005777 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005778
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005780 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 *p++ = '\\';
5782 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005783 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5784 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5785 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5786 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005788
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005789 /* Map special whitespace to '\t', \n', '\r' */
5790 else if (ch == '\t') {
5791 *p++ = '\\';
5792 *p++ = 't';
5793 }
5794 else if (ch == '\n') {
5795 *p++ = '\\';
5796 *p++ = 'n';
5797 }
5798 else if (ch == '\r') {
5799 *p++ = '\\';
5800 *p++ = 'r';
5801 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005802
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005803 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005804 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005806 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005807 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5808 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005809 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005810
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 /* Copy everything else as-is */
5812 else
5813 *p++ = (char) ch;
5814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005816 assert(p - PyBytes_AS_STRING(repr) > 0);
5817 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5818 return NULL;
5819 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820}
5821
Alexander Belopolsky40018472011-02-26 01:02:56 +00005822PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005823PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5824 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005826 PyObject *result;
5827 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5828 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005830 result = PyUnicode_AsUnicodeEscapeString(tmp);
5831 Py_DECREF(tmp);
5832 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833}
5834
5835/* --- Raw Unicode Escape Codec ------------------------------------------- */
5836
Alexander Belopolsky40018472011-02-26 01:02:56 +00005837PyObject *
5838PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005839 Py_ssize_t size,
5840 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005842 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005843 Py_ssize_t startinpos;
5844 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 const char *end;
5847 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 PyObject *errorHandler = NULL;
5849 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005850
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005851 if (size == 0)
5852 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 /* Escaped strings will always be longer than the resulting
5855 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 length after conversion to the true value. (But decoding error
5857 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005858 _PyUnicodeWriter_Init(&writer);
5859 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005860
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 end = s + size;
5862 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 unsigned char c;
5864 Py_UCS4 x;
5865 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005866 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 /* Non-escape characters are interpreted as Unicode ordinals */
5869 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005870 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005871 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005872 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 startinpos = s-starts;
5876
5877 /* \u-escapes are only interpreted iff the number of leading
5878 backslashes if odd */
5879 bs = s;
5880 for (;s < end;) {
5881 if (*s != '\\')
5882 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005883 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005884 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005885 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 }
5887 if (((s - bs) & 1) == 0 ||
5888 s >= end ||
5889 (*s != 'u' && *s != 'U')) {
5890 continue;
5891 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005892 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 count = *s=='u' ? 4 : 8;
5894 s++;
5895
5896 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 for (x = 0, i = 0; i < count; ++i, ++s) {
5898 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005899 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005901 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 errors, &errorHandler,
5903 "rawunicodeescape", "truncated \\uXXXX",
5904 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 goto onError;
5907 goto nextByte;
5908 }
5909 x = (x<<4) & ~0xF;
5910 if (c >= '0' && c <= '9')
5911 x += c - '0';
5912 else if (c >= 'a' && c <= 'f')
5913 x += 10 + c - 'a';
5914 else
5915 x += 10 + c - 'A';
5916 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005917 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005918 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005919 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005920 }
5921 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005922 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005923 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005924 errors, &errorHandler,
5925 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005927 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 nextByte:
5931 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005933 Py_XDECREF(errorHandler);
5934 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005935 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005936
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005938 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005939 Py_XDECREF(errorHandler);
5940 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 return NULL;
5942}
5943
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005944
Alexander Belopolsky40018472011-02-26 01:02:56 +00005945PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005946PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005948 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 char *p;
5950 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005951 Py_ssize_t expandsize, pos;
5952 int kind;
5953 void *data;
5954 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 if (!PyUnicode_Check(unicode)) {
5957 PyErr_BadArgument();
5958 return NULL;
5959 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005960 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 return NULL;
5962 kind = PyUnicode_KIND(unicode);
5963 data = PyUnicode_DATA(unicode);
5964 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005965 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5966 bytes, and 1 byte characters 4. */
5967 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005968
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005969 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005971
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 if (repr == NULL)
5974 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005975 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005976 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005978 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005979 for (pos = 0; pos < len; pos++) {
5980 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 /* Map 32-bit characters to '\Uxxxxxxxx' */
5982 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005983 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005984 *p++ = '\\';
5985 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005986 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5987 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5988 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5989 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5990 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5991 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5992 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5993 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005994 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005996 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 *p++ = '\\';
5998 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005999 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6000 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6001 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6002 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 /* Copy everything else as-is */
6005 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 *p++ = (char) ch;
6007 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006008
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009 assert(p > q);
6010 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006011 return NULL;
6012 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013}
6014
Alexander Belopolsky40018472011-02-26 01:02:56 +00006015PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006016PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6017 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006019 PyObject *result;
6020 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6021 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006022 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6024 Py_DECREF(tmp);
6025 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026}
6027
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006028/* --- Unicode Internal Codec ------------------------------------------- */
6029
Alexander Belopolsky40018472011-02-26 01:02:56 +00006030PyObject *
6031_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006032 Py_ssize_t size,
6033 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006034{
6035 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006036 Py_ssize_t startinpos;
6037 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006038 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006039 const char *end;
6040 const char *reason;
6041 PyObject *errorHandler = NULL;
6042 PyObject *exc = NULL;
6043
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006044 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006045 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006046 1))
6047 return NULL;
6048
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006049 if (size == 0)
6050 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006051
Victor Stinner8f674cc2013-04-17 23:02:17 +02006052 _PyUnicodeWriter_Init(&writer);
6053 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6054 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006056 }
6057 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006058
Victor Stinner8f674cc2013-04-17 23:02:17 +02006059 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006060 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006061 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006062 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006063 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006064 endinpos = end-starts;
6065 reason = "truncated input";
6066 goto error;
6067 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006068 /* We copy the raw representation one byte at a time because the
6069 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006070 ((char *) &uch)[0] = s[0];
6071 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006072#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006073 ((char *) &uch)[2] = s[2];
6074 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006075#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006076 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006077#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006078 /* We have to sanity check the raw data, otherwise doom looms for
6079 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006080 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006081 endinpos = s - starts + Py_UNICODE_SIZE;
6082 reason = "illegal code point (> 0x10FFFF)";
6083 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006084 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006085#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006086 s += Py_UNICODE_SIZE;
6087#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006088 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006089 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006090 Py_UNICODE uch2;
6091 ((char *) &uch2)[0] = s[0];
6092 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006093 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006094 {
Victor Stinner551ac952011-11-29 22:58:13 +01006095 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006096 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006097 }
6098 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006099#endif
6100
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006101 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006102 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006103 continue;
6104
6105 error:
6106 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006107 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006108 errors, &errorHandler,
6109 "unicode_internal", reason,
6110 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006111 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006112 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006113 }
6114
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006115 Py_XDECREF(errorHandler);
6116 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006117 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006118
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006120 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121 Py_XDECREF(errorHandler);
6122 Py_XDECREF(exc);
6123 return NULL;
6124}
6125
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126/* --- Latin-1 Codec ------------------------------------------------------ */
6127
Alexander Belopolsky40018472011-02-26 01:02:56 +00006128PyObject *
6129PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006130 Py_ssize_t size,
6131 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006134 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135}
6136
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006138static void
6139make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006140 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006141 PyObject *unicode,
6142 Py_ssize_t startpos, Py_ssize_t endpos,
6143 const char *reason)
6144{
6145 if (*exceptionObject == NULL) {
6146 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006147 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006148 encoding, unicode, startpos, endpos, reason);
6149 }
6150 else {
6151 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6152 goto onError;
6153 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6154 goto onError;
6155 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6156 goto onError;
6157 return;
6158 onError:
6159 Py_DECREF(*exceptionObject);
6160 *exceptionObject = NULL;
6161 }
6162}
6163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006165static void
6166raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006167 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006168 PyObject *unicode,
6169 Py_ssize_t startpos, Py_ssize_t endpos,
6170 const char *reason)
6171{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006172 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006173 encoding, unicode, startpos, endpos, reason);
6174 if (*exceptionObject != NULL)
6175 PyCodec_StrictErrors(*exceptionObject);
6176}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006177
6178/* error handling callback helper:
6179 build arguments, call the callback and check the arguments,
6180 put the result into newpos and return the replacement string, which
6181 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006182static PyObject *
6183unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006184 PyObject **errorHandler,
6185 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006186 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006187 Py_ssize_t startpos, Py_ssize_t endpos,
6188 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006189{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006190 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006191 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 PyObject *restuple;
6193 PyObject *resunicode;
6194
6195 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 }
6200
Benjamin Petersonbac79492012-01-14 13:34:47 -05006201 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006202 return NULL;
6203 len = PyUnicode_GET_LENGTH(unicode);
6204
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006205 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006206 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209
6210 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006212 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006215 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 Py_DECREF(restuple);
6217 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006218 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006219 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 &resunicode, newpos)) {
6221 Py_DECREF(restuple);
6222 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006223 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006224 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6225 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6226 Py_DECREF(restuple);
6227 return NULL;
6228 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006229 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006230 *newpos = len + *newpos;
6231 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6233 Py_DECREF(restuple);
6234 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006235 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006236 Py_INCREF(resunicode);
6237 Py_DECREF(restuple);
6238 return resunicode;
6239}
6240
Alexander Belopolsky40018472011-02-26 01:02:56 +00006241static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006242unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006243 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006244 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006245{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006246 /* input state */
6247 Py_ssize_t pos=0, size;
6248 int kind;
6249 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006250 /* output object */
6251 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006252 /* pointer into the output */
6253 char *str;
6254 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006255 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006256 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6257 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258 PyObject *errorHandler = NULL;
6259 PyObject *exc = NULL;
6260 /* the following variable is used for caching string comparisons
6261 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6262 int known_errorHandler = -1;
6263
Benjamin Petersonbac79492012-01-14 13:34:47 -05006264 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006265 return NULL;
6266 size = PyUnicode_GET_LENGTH(unicode);
6267 kind = PyUnicode_KIND(unicode);
6268 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006269 /* allocate enough for a simple encoding without
6270 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006271 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006272 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006273 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006274 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006275 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006276 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006277 ressize = size;
6278
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006279 while (pos < size) {
6280 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 /* can we encode this? */
6283 if (c<limit) {
6284 /* no overflow check, because we know that the space is enough */
6285 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006286 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006287 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 Py_ssize_t requiredsize;
6290 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006291 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006293 Py_ssize_t collstart = pos;
6294 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006296 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 ++collend;
6298 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6299 if (known_errorHandler==-1) {
6300 if ((errors==NULL) || (!strcmp(errors, "strict")))
6301 known_errorHandler = 1;
6302 else if (!strcmp(errors, "replace"))
6303 known_errorHandler = 2;
6304 else if (!strcmp(errors, "ignore"))
6305 known_errorHandler = 3;
6306 else if (!strcmp(errors, "xmlcharrefreplace"))
6307 known_errorHandler = 4;
6308 else
6309 known_errorHandler = 0;
6310 }
6311 switch (known_errorHandler) {
6312 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006313 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 goto onError;
6315 case 2: /* replace */
6316 while (collstart++<collend)
6317 *str++ = '?'; /* fall through */
6318 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006319 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 break;
6321 case 4: /* xmlcharrefreplace */
6322 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006323 /* determine replacement size */
6324 for (i = collstart, repsize = 0; i < collend; ++i) {
6325 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6326 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006330 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006332 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006334 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006336 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006338 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006339 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006341 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006343 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 if (requiredsize > ressize) {
6345 if (requiredsize<2*ressize)
6346 requiredsize = 2*ressize;
6347 if (_PyBytes_Resize(&res, requiredsize))
6348 goto onError;
6349 str = PyBytes_AS_STRING(res) + respos;
6350 ressize = requiredsize;
6351 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006352 /* generate replacement */
6353 for (i = collstart; i < collend; ++i) {
6354 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006356 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 break;
6358 default:
6359 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006360 encoding, reason, unicode, &exc,
6361 collstart, collend, &newpos);
6362 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006363 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006365 if (PyBytes_Check(repunicode)) {
6366 /* Directly copy bytes result to output. */
6367 repsize = PyBytes_Size(repunicode);
6368 if (repsize > 1) {
6369 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006370 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006371 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6372 Py_DECREF(repunicode);
6373 goto onError;
6374 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006375 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006376 ressize += repsize-1;
6377 }
6378 memcpy(str, PyBytes_AsString(repunicode), repsize);
6379 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006380 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006381 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006382 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006383 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 /* need more space? (at least enough for what we
6385 have+the replacement+the rest of the string, so
6386 we won't have to check space for encodable characters) */
6387 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 repsize = PyUnicode_GET_LENGTH(repunicode);
6389 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 if (requiredsize > ressize) {
6391 if (requiredsize<2*ressize)
6392 requiredsize = 2*ressize;
6393 if (_PyBytes_Resize(&res, requiredsize)) {
6394 Py_DECREF(repunicode);
6395 goto onError;
6396 }
6397 str = PyBytes_AS_STRING(res) + respos;
6398 ressize = requiredsize;
6399 }
6400 /* check if there is anything unencodable in the replacement
6401 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006402 for (i = 0; repsize-->0; ++i, ++str) {
6403 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006405 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006406 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 Py_DECREF(repunicode);
6408 goto onError;
6409 }
6410 *str = (char)c;
6411 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006413 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006415 }
6416 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006417 /* Resize if we allocated to much */
6418 size = str - PyBytes_AS_STRING(res);
6419 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006420 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006421 if (_PyBytes_Resize(&res, size) < 0)
6422 goto onError;
6423 }
6424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 Py_XDECREF(errorHandler);
6426 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006427 return res;
6428
6429 onError:
6430 Py_XDECREF(res);
6431 Py_XDECREF(errorHandler);
6432 Py_XDECREF(exc);
6433 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434}
6435
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006436/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006437PyObject *
6438PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006439 Py_ssize_t size,
6440 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442 PyObject *result;
6443 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6444 if (unicode == NULL)
6445 return NULL;
6446 result = unicode_encode_ucs1(unicode, errors, 256);
6447 Py_DECREF(unicode);
6448 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449}
6450
Alexander Belopolsky40018472011-02-26 01:02:56 +00006451PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006452_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453{
6454 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 PyErr_BadArgument();
6456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006458 if (PyUnicode_READY(unicode) == -1)
6459 return NULL;
6460 /* Fast path: if it is a one-byte string, construct
6461 bytes object directly. */
6462 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6463 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6464 PyUnicode_GET_LENGTH(unicode));
6465 /* Non-Latin-1 characters present. Defer to above function to
6466 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006468}
6469
6470PyObject*
6471PyUnicode_AsLatin1String(PyObject *unicode)
6472{
6473 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474}
6475
6476/* --- 7-bit ASCII Codec -------------------------------------------------- */
6477
Alexander Belopolsky40018472011-02-26 01:02:56 +00006478PyObject *
6479PyUnicode_DecodeASCII(const char *s,
6480 Py_ssize_t size,
6481 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006484 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006485 int kind;
6486 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006487 Py_ssize_t startinpos;
6488 Py_ssize_t endinpos;
6489 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490 const char *e;
6491 PyObject *errorHandler = NULL;
6492 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006493
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006495 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006496
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006498 if (size == 1 && (unsigned char)s[0] < 128)
6499 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006500
Victor Stinner8f674cc2013-04-17 23:02:17 +02006501 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006502 writer.min_length = size;
6503 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006504 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006507 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006508 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006509 writer.pos = outpos;
6510 if (writer.pos == size)
6511 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006512
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006513 s += writer.pos;
6514 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006515 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006516 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006518 PyUnicode_WRITE(kind, data, writer.pos, c);
6519 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 ++s;
6521 }
6522 else {
6523 startinpos = s-starts;
6524 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006525 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 errors, &errorHandler,
6527 "ascii", "ordinal not in range(128)",
6528 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006529 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006531 kind = writer.kind;
6532 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006535 Py_XDECREF(errorHandler);
6536 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006537 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006538
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006540 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 Py_XDECREF(errorHandler);
6542 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 return NULL;
6544}
6545
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006546/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006547PyObject *
6548PyUnicode_EncodeASCII(const Py_UNICODE *p,
6549 Py_ssize_t size,
6550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 PyObject *result;
6553 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6554 if (unicode == NULL)
6555 return NULL;
6556 result = unicode_encode_ucs1(unicode, errors, 128);
6557 Py_DECREF(unicode);
6558 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559}
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006562_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563{
6564 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 PyErr_BadArgument();
6566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006568 if (PyUnicode_READY(unicode) == -1)
6569 return NULL;
6570 /* Fast path: if it is an ASCII-only string, construct bytes object
6571 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006572 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006573 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6574 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006575 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006576}
6577
6578PyObject *
6579PyUnicode_AsASCIIString(PyObject *unicode)
6580{
6581 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582}
6583
Victor Stinner99b95382011-07-04 14:23:54 +02006584#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006585
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006586/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006587
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006588#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006589#define NEED_RETRY
6590#endif
6591
Victor Stinner3a50e702011-10-18 21:21:00 +02006592#ifndef WC_ERR_INVALID_CHARS
6593# define WC_ERR_INVALID_CHARS 0x0080
6594#endif
6595
6596static char*
6597code_page_name(UINT code_page, PyObject **obj)
6598{
6599 *obj = NULL;
6600 if (code_page == CP_ACP)
6601 return "mbcs";
6602 if (code_page == CP_UTF7)
6603 return "CP_UTF7";
6604 if (code_page == CP_UTF8)
6605 return "CP_UTF8";
6606
6607 *obj = PyBytes_FromFormat("cp%u", code_page);
6608 if (*obj == NULL)
6609 return NULL;
6610 return PyBytes_AS_STRING(*obj);
6611}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006612
Alexander Belopolsky40018472011-02-26 01:02:56 +00006613static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006614is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006615{
6616 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006617 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006618
Victor Stinner3a50e702011-10-18 21:21:00 +02006619 if (!IsDBCSLeadByteEx(code_page, *curr))
6620 return 0;
6621
6622 prev = CharPrevExA(code_page, s, curr, 0);
6623 if (prev == curr)
6624 return 1;
6625 /* FIXME: This code is limited to "true" double-byte encodings,
6626 as it assumes an incomplete character consists of a single
6627 byte. */
6628 if (curr - prev == 2)
6629 return 1;
6630 if (!IsDBCSLeadByteEx(code_page, *prev))
6631 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006632 return 0;
6633}
6634
Victor Stinner3a50e702011-10-18 21:21:00 +02006635static DWORD
6636decode_code_page_flags(UINT code_page)
6637{
6638 if (code_page == CP_UTF7) {
6639 /* The CP_UTF7 decoder only supports flags=0 */
6640 return 0;
6641 }
6642 else
6643 return MB_ERR_INVALID_CHARS;
6644}
6645
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006646/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006647 * Decode a byte string from a Windows code page into unicode object in strict
6648 * mode.
6649 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006650 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6651 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006652 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006653static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006654decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006655 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006656 const char *in,
6657 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006658{
Victor Stinner3a50e702011-10-18 21:21:00 +02006659 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006660 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006661 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006662
6663 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006664 assert(insize > 0);
6665 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6666 if (outsize <= 0)
6667 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006668
6669 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006671 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006672 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 if (*v == NULL)
6674 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006675 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006676 }
6677 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006679 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006680 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006682 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006683 }
6684
6685 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006686 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6687 if (outsize <= 0)
6688 goto error;
6689 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006690
Victor Stinner3a50e702011-10-18 21:21:00 +02006691error:
6692 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6693 return -2;
6694 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006695 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006696}
6697
Victor Stinner3a50e702011-10-18 21:21:00 +02006698/*
6699 * Decode a byte string from a code page into unicode object with an error
6700 * handler.
6701 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006702 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006703 * UnicodeDecodeError exception and returns -1 on error.
6704 */
6705static int
6706decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006707 PyObject **v,
6708 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006709 const char *errors)
6710{
6711 const char *startin = in;
6712 const char *endin = in + size;
6713 const DWORD flags = decode_code_page_flags(code_page);
6714 /* Ideally, we should get reason from FormatMessage. This is the Windows
6715 2000 English version of the message. */
6716 const char *reason = "No mapping for the Unicode character exists "
6717 "in the target code page.";
6718 /* each step cannot decode more than 1 character, but a character can be
6719 represented as a surrogate pair */
6720 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006721 int insize;
6722 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006723 PyObject *errorHandler = NULL;
6724 PyObject *exc = NULL;
6725 PyObject *encoding_obj = NULL;
6726 char *encoding;
6727 DWORD err;
6728 int ret = -1;
6729
6730 assert(size > 0);
6731
6732 encoding = code_page_name(code_page, &encoding_obj);
6733 if (encoding == NULL)
6734 return -1;
6735
6736 if (errors == NULL || strcmp(errors, "strict") == 0) {
6737 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6738 UnicodeDecodeError. */
6739 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6740 if (exc != NULL) {
6741 PyCodec_StrictErrors(exc);
6742 Py_CLEAR(exc);
6743 }
6744 goto error;
6745 }
6746
6747 if (*v == NULL) {
6748 /* Create unicode object */
6749 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6750 PyErr_NoMemory();
6751 goto error;
6752 }
Victor Stinnerab595942011-12-17 04:59:06 +01006753 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006754 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006755 if (*v == NULL)
6756 goto error;
6757 startout = PyUnicode_AS_UNICODE(*v);
6758 }
6759 else {
6760 /* Extend unicode object */
6761 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6762 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6763 PyErr_NoMemory();
6764 goto error;
6765 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006766 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006767 goto error;
6768 startout = PyUnicode_AS_UNICODE(*v) + n;
6769 }
6770
6771 /* Decode the byte string character per character */
6772 out = startout;
6773 while (in < endin)
6774 {
6775 /* Decode a character */
6776 insize = 1;
6777 do
6778 {
6779 outsize = MultiByteToWideChar(code_page, flags,
6780 in, insize,
6781 buffer, Py_ARRAY_LENGTH(buffer));
6782 if (outsize > 0)
6783 break;
6784 err = GetLastError();
6785 if (err != ERROR_NO_UNICODE_TRANSLATION
6786 && err != ERROR_INSUFFICIENT_BUFFER)
6787 {
6788 PyErr_SetFromWindowsErr(0);
6789 goto error;
6790 }
6791 insize++;
6792 }
6793 /* 4=maximum length of a UTF-8 sequence */
6794 while (insize <= 4 && (in + insize) <= endin);
6795
6796 if (outsize <= 0) {
6797 Py_ssize_t startinpos, endinpos, outpos;
6798
6799 startinpos = in - startin;
6800 endinpos = startinpos + 1;
6801 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006802 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006803 errors, &errorHandler,
6804 encoding, reason,
6805 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006806 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006807 {
6808 goto error;
6809 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006810 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006811 }
6812 else {
6813 in += insize;
6814 memcpy(out, buffer, outsize * sizeof(wchar_t));
6815 out += outsize;
6816 }
6817 }
6818
6819 /* write a NUL character at the end */
6820 *out = 0;
6821
6822 /* Extend unicode object */
6823 outsize = out - startout;
6824 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006825 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006826 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006827 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006828
6829error:
6830 Py_XDECREF(encoding_obj);
6831 Py_XDECREF(errorHandler);
6832 Py_XDECREF(exc);
6833 return ret;
6834}
6835
Victor Stinner3a50e702011-10-18 21:21:00 +02006836static PyObject *
6837decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006838 const char *s, Py_ssize_t size,
6839 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006840{
Victor Stinner76a31a62011-11-04 00:05:13 +01006841 PyObject *v = NULL;
6842 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843
Victor Stinner3a50e702011-10-18 21:21:00 +02006844 if (code_page < 0) {
6845 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6846 return NULL;
6847 }
6848
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006849 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006851
Victor Stinner76a31a62011-11-04 00:05:13 +01006852 do
6853 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006854#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006855 if (size > INT_MAX) {
6856 chunk_size = INT_MAX;
6857 final = 0;
6858 done = 0;
6859 }
6860 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006862 {
6863 chunk_size = (int)size;
6864 final = (consumed == NULL);
6865 done = 1;
6866 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867
Victor Stinner76a31a62011-11-04 00:05:13 +01006868 /* Skip trailing lead-byte unless 'final' is set */
6869 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6870 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871
Victor Stinner76a31a62011-11-04 00:05:13 +01006872 if (chunk_size == 0 && done) {
6873 if (v != NULL)
6874 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006875 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006876 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006877
Victor Stinner76a31a62011-11-04 00:05:13 +01006878
6879 converted = decode_code_page_strict(code_page, &v,
6880 s, chunk_size);
6881 if (converted == -2)
6882 converted = decode_code_page_errors(code_page, &v,
6883 s, chunk_size,
6884 errors);
6885 assert(converted != 0);
6886
6887 if (converted < 0) {
6888 Py_XDECREF(v);
6889 return NULL;
6890 }
6891
6892 if (consumed)
6893 *consumed += converted;
6894
6895 s += converted;
6896 size -= converted;
6897 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006898
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006899 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900}
6901
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006903PyUnicode_DecodeCodePageStateful(int code_page,
6904 const char *s,
6905 Py_ssize_t size,
6906 const char *errors,
6907 Py_ssize_t *consumed)
6908{
6909 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6910}
6911
6912PyObject *
6913PyUnicode_DecodeMBCSStateful(const char *s,
6914 Py_ssize_t size,
6915 const char *errors,
6916 Py_ssize_t *consumed)
6917{
6918 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6919}
6920
6921PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyUnicode_DecodeMBCS(const char *s,
6923 Py_ssize_t size,
6924 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006925{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006926 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6927}
6928
Victor Stinner3a50e702011-10-18 21:21:00 +02006929static DWORD
6930encode_code_page_flags(UINT code_page, const char *errors)
6931{
6932 if (code_page == CP_UTF8) {
6933 if (winver.dwMajorVersion >= 6)
6934 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6935 and later */
6936 return WC_ERR_INVALID_CHARS;
6937 else
6938 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6939 return 0;
6940 }
6941 else if (code_page == CP_UTF7) {
6942 /* CP_UTF7 only supports flags=0 */
6943 return 0;
6944 }
6945 else {
6946 if (errors != NULL && strcmp(errors, "replace") == 0)
6947 return 0;
6948 else
6949 return WC_NO_BEST_FIT_CHARS;
6950 }
6951}
6952
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006954 * Encode a Unicode string to a Windows code page into a byte string in strict
6955 * mode.
6956 *
6957 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006958 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006959 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006960static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006961encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006962 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006963 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964{
Victor Stinner554f3f02010-06-16 23:33:54 +00006965 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006966 BOOL *pusedDefaultChar = &usedDefaultChar;
6967 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006968 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006969 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006970 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 const DWORD flags = encode_code_page_flags(code_page, NULL);
6972 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006973 /* Create a substring so that we can get the UTF-16 representation
6974 of just the slice under consideration. */
6975 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976
Martin v. Löwis3d325192011-11-04 18:23:06 +01006977 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006978
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006980 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006982 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006983
Victor Stinner2fc507f2011-11-04 20:06:39 +01006984 substring = PyUnicode_Substring(unicode, offset, offset+len);
6985 if (substring == NULL)
6986 return -1;
6987 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6988 if (p == NULL) {
6989 Py_DECREF(substring);
6990 return -1;
6991 }
Victor Stinner9f067f42013-06-05 00:21:31 +02006992 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01006993
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006994 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006995 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02006996 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006997 NULL, 0,
6998 NULL, pusedDefaultChar);
6999 if (outsize <= 0)
7000 goto error;
7001 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007002 if (pusedDefaultChar && *pusedDefaultChar) {
7003 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007004 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007005 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007006
Victor Stinner3a50e702011-10-18 21:21:00 +02007007 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007010 if (*outbytes == NULL) {
7011 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007013 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007015 }
7016 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 const Py_ssize_t n = PyBytes_Size(*outbytes);
7019 if (outsize > PY_SSIZE_T_MAX - n) {
7020 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007021 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007023 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007024 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7025 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007026 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007027 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029 }
7030
7031 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007032 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007033 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007034 out, outsize,
7035 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007036 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007037 if (outsize <= 0)
7038 goto error;
7039 if (pusedDefaultChar && *pusedDefaultChar)
7040 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007041 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007042
Victor Stinner3a50e702011-10-18 21:21:00 +02007043error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007044 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007045 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7046 return -2;
7047 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007048 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007049}
7050
Victor Stinner3a50e702011-10-18 21:21:00 +02007051/*
7052 * Encode a Unicode string to a Windows code page into a byte string using a
7053 * error handler.
7054 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007055 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007056 * -1 on other error.
7057 */
7058static int
7059encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007060 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007061 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007062{
Victor Stinner3a50e702011-10-18 21:21:00 +02007063 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007064 Py_ssize_t pos = unicode_offset;
7065 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007066 /* Ideally, we should get reason from FormatMessage. This is the Windows
7067 2000 English version of the message. */
7068 const char *reason = "invalid character";
7069 /* 4=maximum length of a UTF-8 sequence */
7070 char buffer[4];
7071 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7072 Py_ssize_t outsize;
7073 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 PyObject *errorHandler = NULL;
7075 PyObject *exc = NULL;
7076 PyObject *encoding_obj = NULL;
7077 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007078 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007079 PyObject *rep;
7080 int ret = -1;
7081
7082 assert(insize > 0);
7083
7084 encoding = code_page_name(code_page, &encoding_obj);
7085 if (encoding == NULL)
7086 return -1;
7087
7088 if (errors == NULL || strcmp(errors, "strict") == 0) {
7089 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7090 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007091 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 if (exc != NULL) {
7093 PyCodec_StrictErrors(exc);
7094 Py_DECREF(exc);
7095 }
7096 Py_XDECREF(encoding_obj);
7097 return -1;
7098 }
7099
7100 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7101 pusedDefaultChar = &usedDefaultChar;
7102 else
7103 pusedDefaultChar = NULL;
7104
7105 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7106 PyErr_NoMemory();
7107 goto error;
7108 }
7109 outsize = insize * Py_ARRAY_LENGTH(buffer);
7110
7111 if (*outbytes == NULL) {
7112 /* Create string object */
7113 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7114 if (*outbytes == NULL)
7115 goto error;
7116 out = PyBytes_AS_STRING(*outbytes);
7117 }
7118 else {
7119 /* Extend string object */
7120 Py_ssize_t n = PyBytes_Size(*outbytes);
7121 if (n > PY_SSIZE_T_MAX - outsize) {
7122 PyErr_NoMemory();
7123 goto error;
7124 }
7125 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7126 goto error;
7127 out = PyBytes_AS_STRING(*outbytes) + n;
7128 }
7129
7130 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007131 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007132 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007133 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7134 wchar_t chars[2];
7135 int charsize;
7136 if (ch < 0x10000) {
7137 chars[0] = (wchar_t)ch;
7138 charsize = 1;
7139 }
7140 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007141 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7142 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007143 charsize = 2;
7144 }
7145
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007147 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007148 buffer, Py_ARRAY_LENGTH(buffer),
7149 NULL, pusedDefaultChar);
7150 if (outsize > 0) {
7151 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7152 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007153 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 memcpy(out, buffer, outsize);
7155 out += outsize;
7156 continue;
7157 }
7158 }
7159 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7160 PyErr_SetFromWindowsErr(0);
7161 goto error;
7162 }
7163
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 rep = unicode_encode_call_errorhandler(
7165 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007166 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007167 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 if (rep == NULL)
7169 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007170 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007171
7172 if (PyBytes_Check(rep)) {
7173 outsize = PyBytes_GET_SIZE(rep);
7174 if (outsize != 1) {
7175 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7176 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7177 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7178 Py_DECREF(rep);
7179 goto error;
7180 }
7181 out = PyBytes_AS_STRING(*outbytes) + offset;
7182 }
7183 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7184 out += outsize;
7185 }
7186 else {
7187 Py_ssize_t i;
7188 enum PyUnicode_Kind kind;
7189 void *data;
7190
Benjamin Petersonbac79492012-01-14 13:34:47 -05007191 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 Py_DECREF(rep);
7193 goto error;
7194 }
7195
7196 outsize = PyUnicode_GET_LENGTH(rep);
7197 if (outsize != 1) {
7198 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7199 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7200 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7201 Py_DECREF(rep);
7202 goto error;
7203 }
7204 out = PyBytes_AS_STRING(*outbytes) + offset;
7205 }
7206 kind = PyUnicode_KIND(rep);
7207 data = PyUnicode_DATA(rep);
7208 for (i=0; i < outsize; i++) {
7209 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7210 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007211 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007212 encoding, unicode,
7213 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 "unable to encode error handler result to ASCII");
7215 Py_DECREF(rep);
7216 goto error;
7217 }
7218 *out = (unsigned char)ch;
7219 out++;
7220 }
7221 }
7222 Py_DECREF(rep);
7223 }
7224 /* write a NUL byte */
7225 *out = 0;
7226 outsize = out - PyBytes_AS_STRING(*outbytes);
7227 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7228 if (_PyBytes_Resize(outbytes, outsize) < 0)
7229 goto error;
7230 ret = 0;
7231
7232error:
7233 Py_XDECREF(encoding_obj);
7234 Py_XDECREF(errorHandler);
7235 Py_XDECREF(exc);
7236 return ret;
7237}
7238
Victor Stinner3a50e702011-10-18 21:21:00 +02007239static PyObject *
7240encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007241 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 const char *errors)
7243{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007244 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007246 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007247 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007248
Benjamin Petersonbac79492012-01-14 13:34:47 -05007249 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007250 return NULL;
7251 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007252
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 if (code_page < 0) {
7254 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7255 return NULL;
7256 }
7257
Martin v. Löwis3d325192011-11-04 18:23:06 +01007258 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007259 return PyBytes_FromStringAndSize(NULL, 0);
7260
Victor Stinner7581cef2011-11-03 22:32:33 +01007261 offset = 0;
7262 do
7263 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007264#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007265 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007266 chunks. */
7267 if (len > INT_MAX/2) {
7268 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007269 done = 0;
7270 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007271 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007273 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007274 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007275 done = 1;
7276 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007277
Victor Stinner76a31a62011-11-04 00:05:13 +01007278 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007279 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007280 errors);
7281 if (ret == -2)
7282 ret = encode_code_page_errors(code_page, &outbytes,
7283 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007284 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007285 if (ret < 0) {
7286 Py_XDECREF(outbytes);
7287 return NULL;
7288 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007289
Victor Stinner7581cef2011-11-03 22:32:33 +01007290 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007291 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007292 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 return outbytes;
7295}
7296
7297PyObject *
7298PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7299 Py_ssize_t size,
7300 const char *errors)
7301{
Victor Stinner7581cef2011-11-03 22:32:33 +01007302 PyObject *unicode, *res;
7303 unicode = PyUnicode_FromUnicode(p, size);
7304 if (unicode == NULL)
7305 return NULL;
7306 res = encode_code_page(CP_ACP, unicode, errors);
7307 Py_DECREF(unicode);
7308 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007309}
7310
7311PyObject *
7312PyUnicode_EncodeCodePage(int code_page,
7313 PyObject *unicode,
7314 const char *errors)
7315{
Victor Stinner7581cef2011-11-03 22:32:33 +01007316 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007317}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007318
Alexander Belopolsky40018472011-02-26 01:02:56 +00007319PyObject *
7320PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007321{
7322 if (!PyUnicode_Check(unicode)) {
7323 PyErr_BadArgument();
7324 return NULL;
7325 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007326 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007327}
7328
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007329#undef NEED_RETRY
7330
Victor Stinner99b95382011-07-04 14:23:54 +02007331#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007332
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333/* --- Character Mapping Codec -------------------------------------------- */
7334
Victor Stinnerfb161b12013-04-18 01:44:27 +02007335static int
7336charmap_decode_string(const char *s,
7337 Py_ssize_t size,
7338 PyObject *mapping,
7339 const char *errors,
7340 _PyUnicodeWriter *writer)
7341{
7342 const char *starts = s;
7343 const char *e;
7344 Py_ssize_t startinpos, endinpos;
7345 PyObject *errorHandler = NULL, *exc = NULL;
7346 Py_ssize_t maplen;
7347 enum PyUnicode_Kind mapkind;
7348 void *mapdata;
7349 Py_UCS4 x;
7350 unsigned char ch;
7351
7352 if (PyUnicode_READY(mapping) == -1)
7353 return -1;
7354
7355 maplen = PyUnicode_GET_LENGTH(mapping);
7356 mapdata = PyUnicode_DATA(mapping);
7357 mapkind = PyUnicode_KIND(mapping);
7358
7359 e = s + size;
7360
7361 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7362 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7363 * is disabled in encoding aliases, latin1 is preferred because
7364 * its implementation is faster. */
7365 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7366 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7367 Py_UCS4 maxchar = writer->maxchar;
7368
7369 assert (writer->kind == PyUnicode_1BYTE_KIND);
7370 while (s < e) {
7371 ch = *s;
7372 x = mapdata_ucs1[ch];
7373 if (x > maxchar) {
7374 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7375 goto onError;
7376 maxchar = writer->maxchar;
7377 outdata = (Py_UCS1 *)writer->data;
7378 }
7379 outdata[writer->pos] = x;
7380 writer->pos++;
7381 ++s;
7382 }
7383 return 0;
7384 }
7385
7386 while (s < e) {
7387 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7388 enum PyUnicode_Kind outkind = writer->kind;
7389 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7390 if (outkind == PyUnicode_1BYTE_KIND) {
7391 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7392 Py_UCS4 maxchar = writer->maxchar;
7393 while (s < e) {
7394 ch = *s;
7395 x = mapdata_ucs2[ch];
7396 if (x > maxchar)
7397 goto Error;
7398 outdata[writer->pos] = x;
7399 writer->pos++;
7400 ++s;
7401 }
7402 break;
7403 }
7404 else if (outkind == PyUnicode_2BYTE_KIND) {
7405 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7406 while (s < e) {
7407 ch = *s;
7408 x = mapdata_ucs2[ch];
7409 if (x == 0xFFFE)
7410 goto Error;
7411 outdata[writer->pos] = x;
7412 writer->pos++;
7413 ++s;
7414 }
7415 break;
7416 }
7417 }
7418 ch = *s;
7419
7420 if (ch < maplen)
7421 x = PyUnicode_READ(mapkind, mapdata, ch);
7422 else
7423 x = 0xfffe; /* invalid value */
7424Error:
7425 if (x == 0xfffe)
7426 {
7427 /* undefined mapping */
7428 startinpos = s-starts;
7429 endinpos = startinpos+1;
7430 if (unicode_decode_call_errorhandler_writer(
7431 errors, &errorHandler,
7432 "charmap", "character maps to <undefined>",
7433 &starts, &e, &startinpos, &endinpos, &exc, &s,
7434 writer)) {
7435 goto onError;
7436 }
7437 continue;
7438 }
7439
7440 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7441 goto onError;
7442 ++s;
7443 }
7444 Py_XDECREF(errorHandler);
7445 Py_XDECREF(exc);
7446 return 0;
7447
7448onError:
7449 Py_XDECREF(errorHandler);
7450 Py_XDECREF(exc);
7451 return -1;
7452}
7453
7454static int
7455charmap_decode_mapping(const char *s,
7456 Py_ssize_t size,
7457 PyObject *mapping,
7458 const char *errors,
7459 _PyUnicodeWriter *writer)
7460{
7461 const char *starts = s;
7462 const char *e;
7463 Py_ssize_t startinpos, endinpos;
7464 PyObject *errorHandler = NULL, *exc = NULL;
7465 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007466 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007467
7468 e = s + size;
7469
7470 while (s < e) {
7471 ch = *s;
7472
7473 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7474 key = PyLong_FromLong((long)ch);
7475 if (key == NULL)
7476 goto onError;
7477
7478 item = PyObject_GetItem(mapping, key);
7479 Py_DECREF(key);
7480 if (item == NULL) {
7481 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7482 /* No mapping found means: mapping is undefined. */
7483 PyErr_Clear();
7484 goto Undefined;
7485 } else
7486 goto onError;
7487 }
7488
7489 /* Apply mapping */
7490 if (item == Py_None)
7491 goto Undefined;
7492 if (PyLong_Check(item)) {
7493 long value = PyLong_AS_LONG(item);
7494 if (value == 0xFFFE)
7495 goto Undefined;
7496 if (value < 0 || value > MAX_UNICODE) {
7497 PyErr_Format(PyExc_TypeError,
7498 "character mapping must be in range(0x%lx)",
7499 (unsigned long)MAX_UNICODE + 1);
7500 goto onError;
7501 }
7502
7503 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7504 goto onError;
7505 }
7506 else if (PyUnicode_Check(item)) {
7507 if (PyUnicode_READY(item) == -1)
7508 goto onError;
7509 if (PyUnicode_GET_LENGTH(item) == 1) {
7510 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7511 if (value == 0xFFFE)
7512 goto Undefined;
7513 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7514 goto onError;
7515 }
7516 else {
7517 writer->overallocate = 1;
7518 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7519 goto onError;
7520 }
7521 }
7522 else {
7523 /* wrong return value */
7524 PyErr_SetString(PyExc_TypeError,
7525 "character mapping must return integer, None or str");
7526 goto onError;
7527 }
7528 Py_CLEAR(item);
7529 ++s;
7530 continue;
7531
7532Undefined:
7533 /* undefined mapping */
7534 Py_CLEAR(item);
7535 startinpos = s-starts;
7536 endinpos = startinpos+1;
7537 if (unicode_decode_call_errorhandler_writer(
7538 errors, &errorHandler,
7539 "charmap", "character maps to <undefined>",
7540 &starts, &e, &startinpos, &endinpos, &exc, &s,
7541 writer)) {
7542 goto onError;
7543 }
7544 }
7545 Py_XDECREF(errorHandler);
7546 Py_XDECREF(exc);
7547 return 0;
7548
7549onError:
7550 Py_XDECREF(item);
7551 Py_XDECREF(errorHandler);
7552 Py_XDECREF(exc);
7553 return -1;
7554}
7555
Alexander Belopolsky40018472011-02-26 01:02:56 +00007556PyObject *
7557PyUnicode_DecodeCharmap(const char *s,
7558 Py_ssize_t size,
7559 PyObject *mapping,
7560 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007562 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007563
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 /* Default to Latin-1 */
7565 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007569 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007570 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007571 writer.min_length = size;
7572 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007574
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007575 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007576 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7577 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007578 }
7579 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007580 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7581 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007583 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007584
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007586 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587 return NULL;
7588}
7589
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007590/* Charmap encoding: the lookup table */
7591
Alexander Belopolsky40018472011-02-26 01:02:56 +00007592struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 PyObject_HEAD
7594 unsigned char level1[32];
7595 int count2, count3;
7596 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007597};
7598
7599static PyObject*
7600encoding_map_size(PyObject *obj, PyObject* args)
7601{
7602 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007603 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007605}
7606
7607static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007608 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 PyDoc_STR("Return the size (in bytes) of this object") },
7610 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007611};
7612
7613static void
7614encoding_map_dealloc(PyObject* o)
7615{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007616 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007617}
7618
7619static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007620 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 "EncodingMap", /*tp_name*/
7622 sizeof(struct encoding_map), /*tp_basicsize*/
7623 0, /*tp_itemsize*/
7624 /* methods */
7625 encoding_map_dealloc, /*tp_dealloc*/
7626 0, /*tp_print*/
7627 0, /*tp_getattr*/
7628 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007629 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 0, /*tp_repr*/
7631 0, /*tp_as_number*/
7632 0, /*tp_as_sequence*/
7633 0, /*tp_as_mapping*/
7634 0, /*tp_hash*/
7635 0, /*tp_call*/
7636 0, /*tp_str*/
7637 0, /*tp_getattro*/
7638 0, /*tp_setattro*/
7639 0, /*tp_as_buffer*/
7640 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7641 0, /*tp_doc*/
7642 0, /*tp_traverse*/
7643 0, /*tp_clear*/
7644 0, /*tp_richcompare*/
7645 0, /*tp_weaklistoffset*/
7646 0, /*tp_iter*/
7647 0, /*tp_iternext*/
7648 encoding_map_methods, /*tp_methods*/
7649 0, /*tp_members*/
7650 0, /*tp_getset*/
7651 0, /*tp_base*/
7652 0, /*tp_dict*/
7653 0, /*tp_descr_get*/
7654 0, /*tp_descr_set*/
7655 0, /*tp_dictoffset*/
7656 0, /*tp_init*/
7657 0, /*tp_alloc*/
7658 0, /*tp_new*/
7659 0, /*tp_free*/
7660 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007661};
7662
7663PyObject*
7664PyUnicode_BuildEncodingMap(PyObject* string)
7665{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007666 PyObject *result;
7667 struct encoding_map *mresult;
7668 int i;
7669 int need_dict = 0;
7670 unsigned char level1[32];
7671 unsigned char level2[512];
7672 unsigned char *mlevel1, *mlevel2, *mlevel3;
7673 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007674 int kind;
7675 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007676 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007677 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007678
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007679 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007680 PyErr_BadArgument();
7681 return NULL;
7682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007683 kind = PyUnicode_KIND(string);
7684 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007685 length = PyUnicode_GET_LENGTH(string);
7686 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007687 memset(level1, 0xFF, sizeof level1);
7688 memset(level2, 0xFF, sizeof level2);
7689
7690 /* If there isn't a one-to-one mapping of NULL to \0,
7691 or if there are non-BMP characters, we need to use
7692 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007693 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007694 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007695 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007696 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007697 ch = PyUnicode_READ(kind, data, i);
7698 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007699 need_dict = 1;
7700 break;
7701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007702 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007703 /* unmapped character */
7704 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007705 l1 = ch >> 11;
7706 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007707 if (level1[l1] == 0xFF)
7708 level1[l1] = count2++;
7709 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007710 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007711 }
7712
7713 if (count2 >= 0xFF || count3 >= 0xFF)
7714 need_dict = 1;
7715
7716 if (need_dict) {
7717 PyObject *result = PyDict_New();
7718 PyObject *key, *value;
7719 if (!result)
7720 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007721 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007722 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007723 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007724 if (!key || !value)
7725 goto failed1;
7726 if (PyDict_SetItem(result, key, value) == -1)
7727 goto failed1;
7728 Py_DECREF(key);
7729 Py_DECREF(value);
7730 }
7731 return result;
7732 failed1:
7733 Py_XDECREF(key);
7734 Py_XDECREF(value);
7735 Py_DECREF(result);
7736 return NULL;
7737 }
7738
7739 /* Create a three-level trie */
7740 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7741 16*count2 + 128*count3 - 1);
7742 if (!result)
7743 return PyErr_NoMemory();
7744 PyObject_Init(result, &EncodingMapType);
7745 mresult = (struct encoding_map*)result;
7746 mresult->count2 = count2;
7747 mresult->count3 = count3;
7748 mlevel1 = mresult->level1;
7749 mlevel2 = mresult->level23;
7750 mlevel3 = mresult->level23 + 16*count2;
7751 memcpy(mlevel1, level1, 32);
7752 memset(mlevel2, 0xFF, 16*count2);
7753 memset(mlevel3, 0, 128*count3);
7754 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007755 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007756 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007757 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7758 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007759 /* unmapped character */
7760 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007761 o1 = ch>>11;
7762 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763 i2 = 16*mlevel1[o1] + o2;
7764 if (mlevel2[i2] == 0xFF)
7765 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007766 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767 i3 = 128*mlevel2[i2] + o3;
7768 mlevel3[i3] = i;
7769 }
7770 return result;
7771}
7772
7773static int
Victor Stinner22168992011-11-20 17:09:18 +01007774encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775{
7776 struct encoding_map *map = (struct encoding_map*)mapping;
7777 int l1 = c>>11;
7778 int l2 = (c>>7) & 0xF;
7779 int l3 = c & 0x7F;
7780 int i;
7781
Victor Stinner22168992011-11-20 17:09:18 +01007782 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007784 if (c == 0)
7785 return 0;
7786 /* level 1*/
7787 i = map->level1[l1];
7788 if (i == 0xFF) {
7789 return -1;
7790 }
7791 /* level 2*/
7792 i = map->level23[16*i+l2];
7793 if (i == 0xFF) {
7794 return -1;
7795 }
7796 /* level 3 */
7797 i = map->level23[16*map->count2 + 128*i + l3];
7798 if (i == 0) {
7799 return -1;
7800 }
7801 return i;
7802}
7803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007804/* Lookup the character ch in the mapping. If the character
7805 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007806 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007807static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007808charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809{
Christian Heimes217cfd12007-12-02 14:31:20 +00007810 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007811 PyObject *x;
7812
7813 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 x = PyObject_GetItem(mapping, w);
7816 Py_DECREF(w);
7817 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7819 /* No mapping found means: mapping is undefined. */
7820 PyErr_Clear();
7821 x = Py_None;
7822 Py_INCREF(x);
7823 return x;
7824 } else
7825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007827 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007829 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 long value = PyLong_AS_LONG(x);
7831 if (value < 0 || value > 255) {
7832 PyErr_SetString(PyExc_TypeError,
7833 "character mapping must be in range(256)");
7834 Py_DECREF(x);
7835 return NULL;
7836 }
7837 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007839 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 /* wrong return value */
7843 PyErr_Format(PyExc_TypeError,
7844 "character mapping must return integer, bytes or None, not %.400s",
7845 x->ob_type->tp_name);
7846 Py_DECREF(x);
7847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 }
7849}
7850
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007851static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007852charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7855 /* exponentially overallocate to minimize reallocations */
7856 if (requiredsize < 2*outsize)
7857 requiredsize = 2*outsize;
7858 if (_PyBytes_Resize(outobj, requiredsize))
7859 return -1;
7860 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861}
7862
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping has no entry for the character */
    enc_EXCEPTION = 2   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007867 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868 space is available. Return a new reference to the object that
7869 was put in the output buffer, or Py_None, if the mapping was undefined
7870 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007871 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007872static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007873charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007874 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007876 PyObject *rep;
7877 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007878 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007879
Christian Heimes90aa7642007-12-19 02:45:37 +00007880 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 if (res == -1)
7884 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 if (outsize<requiredsize)
7886 if (charmapencode_resize(outobj, outpos, requiredsize))
7887 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007888 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 outstart[(*outpos)++] = (char)res;
7890 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 }
7892
7893 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007896 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 Py_DECREF(rep);
7898 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 if (PyLong_Check(rep)) {
7901 Py_ssize_t requiredsize = *outpos+1;
7902 if (outsize<requiredsize)
7903 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7904 Py_DECREF(rep);
7905 return enc_EXCEPTION;
7906 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007907 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007909 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 else {
7911 const char *repchars = PyBytes_AS_STRING(rep);
7912 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7913 Py_ssize_t requiredsize = *outpos+repsize;
7914 if (outsize<requiredsize)
7915 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7916 Py_DECREF(rep);
7917 return enc_EXCEPTION;
7918 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007919 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 memcpy(outstart + *outpos, repchars, repsize);
7921 *outpos += repsize;
7922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007923 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924 Py_DECREF(rep);
7925 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007926}
7927
7928/* handle an error in PyUnicode_EncodeCharmap
7929 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007930static int
7931charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007932 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007934 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007935 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936{
7937 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007938 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007939 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007940 enum PyUnicode_Kind kind;
7941 void *data;
7942 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007944 Py_ssize_t collstartpos = *inpos;
7945 Py_ssize_t collendpos = *inpos+1;
7946 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947 char *encoding = "charmap";
7948 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007950 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007951 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952
Benjamin Petersonbac79492012-01-14 13:34:47 -05007953 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007954 return -1;
7955 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956 /* find all unencodable characters */
7957 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007958 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007959 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007960 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007961 val = encoding_map_lookup(ch, mapping);
7962 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 break;
7964 ++collendpos;
7965 continue;
7966 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007967
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007968 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7969 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 if (rep==NULL)
7971 return -1;
7972 else if (rep!=Py_None) {
7973 Py_DECREF(rep);
7974 break;
7975 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007976 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007978 }
7979 /* cache callback name lookup
7980 * (if not done yet, i.e. it's the first error) */
7981 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 if ((errors==NULL) || (!strcmp(errors, "strict")))
7983 *known_errorHandler = 1;
7984 else if (!strcmp(errors, "replace"))
7985 *known_errorHandler = 2;
7986 else if (!strcmp(errors, "ignore"))
7987 *known_errorHandler = 3;
7988 else if (!strcmp(errors, "xmlcharrefreplace"))
7989 *known_errorHandler = 4;
7990 else
7991 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007992 }
7993 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007994 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007995 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007996 return -1;
7997 case 2: /* replace */
7998 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 x = charmapencode_output('?', mapping, res, respos);
8000 if (x==enc_EXCEPTION) {
8001 return -1;
8002 }
8003 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008004 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 return -1;
8006 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008007 }
8008 /* fall through */
8009 case 3: /* ignore */
8010 *inpos = collendpos;
8011 break;
8012 case 4: /* xmlcharrefreplace */
8013 /* generate replacement (temporarily (mis)uses p) */
8014 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 char buffer[2+29+1+1];
8016 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008017 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 for (cp = buffer; *cp; ++cp) {
8019 x = charmapencode_output(*cp, mapping, res, respos);
8020 if (x==enc_EXCEPTION)
8021 return -1;
8022 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008023 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return -1;
8025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008026 }
8027 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008028 *inpos = collendpos;
8029 break;
8030 default:
8031 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008032 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008034 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008036 if (PyBytes_Check(repunicode)) {
8037 /* Directly copy bytes result to output. */
8038 Py_ssize_t outsize = PyBytes_Size(*res);
8039 Py_ssize_t requiredsize;
8040 repsize = PyBytes_Size(repunicode);
8041 requiredsize = *respos + repsize;
8042 if (requiredsize > outsize)
8043 /* Make room for all additional bytes. */
8044 if (charmapencode_resize(res, respos, requiredsize)) {
8045 Py_DECREF(repunicode);
8046 return -1;
8047 }
8048 memcpy(PyBytes_AsString(*res) + *respos,
8049 PyBytes_AsString(repunicode), repsize);
8050 *respos += repsize;
8051 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008052 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008053 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008054 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008055 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008056 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008057 Py_DECREF(repunicode);
8058 return -1;
8059 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008060 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008061 data = PyUnicode_DATA(repunicode);
8062 kind = PyUnicode_KIND(repunicode);
8063 for (index = 0; index < repsize; index++) {
8064 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8065 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008067 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 return -1;
8069 }
8070 else if (x==enc_FAILED) {
8071 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008072 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 return -1;
8074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008075 }
8076 *inpos = newpos;
8077 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 }
8079 return 0;
8080}
8081
Alexander Belopolsky40018472011-02-26 01:02:56 +00008082PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008083_PyUnicode_EncodeCharmap(PyObject *unicode,
8084 PyObject *mapping,
8085 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008087 /* output object */
8088 PyObject *res = NULL;
8089 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008090 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008091 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008093 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094 PyObject *errorHandler = NULL;
8095 PyObject *exc = NULL;
8096 /* the following variable is used for caching string comparisons
8097 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8098 * 3=ignore, 4=xmlcharrefreplace */
8099 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008100 void *data;
8101 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102
Benjamin Petersonbac79492012-01-14 13:34:47 -05008103 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008104 return NULL;
8105 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008106 data = PyUnicode_DATA(unicode);
8107 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008108
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 /* Default to Latin-1 */
8110 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008111 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113 /* allocate enough for a simple encoding without
8114 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008115 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008116 if (res == NULL)
8117 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008118 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008122 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008124 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 if (x==enc_EXCEPTION) /* error */
8126 goto onError;
8127 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008128 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 &exc,
8130 &known_errorHandler, &errorHandler, errors,
8131 &res, &respos)) {
8132 goto onError;
8133 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 else
8136 /* done with this character => adjust input position */
8137 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008141 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008142 if (_PyBytes_Resize(&res, respos) < 0)
8143 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 Py_XDECREF(exc);
8146 Py_XDECREF(errorHandler);
8147 return res;
8148
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150 Py_XDECREF(res);
8151 Py_XDECREF(exc);
8152 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 return NULL;
8154}
8155
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008156/* Deprecated */
8157PyObject *
8158PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8159 Py_ssize_t size,
8160 PyObject *mapping,
8161 const char *errors)
8162{
8163 PyObject *result;
8164 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8165 if (unicode == NULL)
8166 return NULL;
8167 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8168 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008169 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008170}
8171
Alexander Belopolsky40018472011-02-26 01:02:56 +00008172PyObject *
8173PyUnicode_AsCharmapString(PyObject *unicode,
8174 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175{
8176 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 PyErr_BadArgument();
8178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008180 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181}
8182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008183/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008184static void
8185make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008186 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008187 Py_ssize_t startpos, Py_ssize_t endpos,
8188 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008190 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 *exceptionObject = _PyUnicodeTranslateError_Create(
8192 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193 }
8194 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8196 goto onError;
8197 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8198 goto onError;
8199 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8200 goto onError;
8201 return;
8202 onError:
8203 Py_DECREF(*exceptionObject);
8204 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205 }
8206}
8207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008208/* error handling callback helper:
8209 build arguments, call the callback and check the arguments,
8210 put the result into newpos and return the replacement string, which
8211 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008212static PyObject *
8213unicode_translate_call_errorhandler(const char *errors,
8214 PyObject **errorHandler,
8215 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008216 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008217 Py_ssize_t startpos, Py_ssize_t endpos,
8218 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008219{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008220 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008222 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 PyObject *restuple;
8224 PyObject *resunicode;
8225
8226 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008228 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008230 }
8231
8232 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236
8237 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008242 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 Py_DECREF(restuple);
8244 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 }
8246 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 &resunicode, &i_newpos)) {
8248 Py_DECREF(restuple);
8249 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008251 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008253 else
8254 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008255 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8257 Py_DECREF(restuple);
8258 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008259 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 Py_INCREF(resunicode);
8261 Py_DECREF(restuple);
8262 return resunicode;
8263}
8264
8265/* Lookup the character ch in the mapping and put the result in result,
8266 which must be decrefed by the caller.
8267 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008268static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008269charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270{
Christian Heimes217cfd12007-12-02 14:31:20 +00008271 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 PyObject *x;
8273
8274 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 x = PyObject_GetItem(mapping, w);
8277 Py_DECREF(w);
8278 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8280 /* No mapping found means: use 1:1 mapping. */
8281 PyErr_Clear();
8282 *result = NULL;
8283 return 0;
8284 } else
8285 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 }
8287 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 *result = x;
8289 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008291 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 long value = PyLong_AS_LONG(x);
8293 long max = PyUnicode_GetMax();
8294 if (value < 0 || value > max) {
8295 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008296 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 Py_DECREF(x);
8298 return -1;
8299 }
8300 *result = x;
8301 return 0;
8302 }
8303 else if (PyUnicode_Check(x)) {
8304 *result = x;
8305 return 0;
8306 }
8307 else {
8308 /* wrong return value */
8309 PyErr_SetString(PyExc_TypeError,
8310 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008311 Py_DECREF(x);
8312 return -1;
8313 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314}
8315/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 if not reallocate and adjust various state variables.
8317 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008318static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008322 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008323 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008324 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 /* exponentially overallocate to minimize reallocations */
8326 if (requiredsize < 2 * oldsize)
8327 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008328 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8329 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008331 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 }
8334 return 0;
8335}
8336/* lookup the character, put the result in the output string and adjust
8337 various state variables. Return a new reference to the object that
8338 was put in the output buffer in *result, or Py_None, if the mapping was
8339 undefined (in which case no character was written).
8340 The called must decref result.
8341 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008342static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8344 PyObject *mapping, Py_UCS4 **output,
8345 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008346 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8349 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 }
8355 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008357 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008359 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 }
8361 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 Py_ssize_t repsize;
8363 if (PyUnicode_READY(*res) == -1)
8364 return -1;
8365 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 if (repsize==1) {
8367 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 }
8370 else if (repsize!=0) {
8371 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 Py_ssize_t requiredsize = *opos +
8373 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 Py_ssize_t i;
8376 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 for(i = 0; i < repsize; i++)
8379 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 }
8382 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 return 0;
8385}
8386
Alexander Belopolsky40018472011-02-26 01:02:56 +00008387PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388_PyUnicode_TranslateCharmap(PyObject *input,
8389 PyObject *mapping,
8390 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 /* input object */
8393 char *idata;
8394 Py_ssize_t size, i;
8395 int kind;
8396 /* output buffer */
8397 Py_UCS4 *output = NULL;
8398 Py_ssize_t osize;
8399 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 char *reason = "character maps to <undefined>";
8403 PyObject *errorHandler = NULL;
8404 PyObject *exc = NULL;
8405 /* the following variable is used for caching string comparisons
8406 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8407 * 3=ignore, 4=xmlcharrefreplace */
8408 int known_errorHandler = -1;
8409
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 PyErr_BadArgument();
8412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 if (PyUnicode_READY(input) == -1)
8416 return NULL;
8417 idata = (char*)PyUnicode_DATA(input);
8418 kind = PyUnicode_KIND(input);
8419 size = PyUnicode_GET_LENGTH(input);
8420 i = 0;
8421
8422 if (size == 0) {
8423 Py_INCREF(input);
8424 return input;
8425 }
8426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 /* allocate enough for a simple 1:1 translation without
8428 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429 osize = size;
8430 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8431 opos = 0;
8432 if (output == NULL) {
8433 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 /* try to encode it */
8439 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 if (charmaptranslate_output(input, i, mapping,
8441 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 Py_XDECREF(x);
8443 goto onError;
8444 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008445 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 else { /* untranslatable character */
8449 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8450 Py_ssize_t repsize;
8451 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 Py_ssize_t collstart = i;
8455 Py_ssize_t collend = i+1;
8456 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 while (collend < size) {
8460 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 goto onError;
8462 Py_XDECREF(x);
8463 if (x!=Py_None)
8464 break;
8465 ++collend;
8466 }
8467 /* cache callback name lookup
8468 * (if not done yet, i.e. it's the first error) */
8469 if (known_errorHandler==-1) {
8470 if ((errors==NULL) || (!strcmp(errors, "strict")))
8471 known_errorHandler = 1;
8472 else if (!strcmp(errors, "replace"))
8473 known_errorHandler = 2;
8474 else if (!strcmp(errors, "ignore"))
8475 known_errorHandler = 3;
8476 else if (!strcmp(errors, "xmlcharrefreplace"))
8477 known_errorHandler = 4;
8478 else
8479 known_errorHandler = 0;
8480 }
8481 switch (known_errorHandler) {
8482 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008483 make_translate_exception(&exc,
8484 input, collstart, collend, reason);
8485 if (exc != NULL)
8486 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 case 2: /* replace */
8489 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 for (coll = collstart; coll<collend; coll++)
8491 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 /* fall through */
8493 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 break;
8496 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 /* generate replacement (temporarily (mis)uses i) */
8498 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 char buffer[2+29+1+1];
8500 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8502 if (charmaptranslate_makespace(&output, &osize,
8503 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 goto onError;
8505 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 break;
8510 default:
8511 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 reason, input, &exc,
8513 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008514 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008516 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008517 Py_DECREF(repunicode);
8518 goto onError;
8519 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 repsize = PyUnicode_GET_LENGTH(repunicode);
8522 if (charmaptranslate_makespace(&output, &osize,
8523 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 Py_DECREF(repunicode);
8525 goto onError;
8526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 for (uni2 = 0; repsize-->0; ++uni2)
8528 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8529 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008531 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008532 }
8533 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8535 if (!res)
8536 goto onError;
8537 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 Py_XDECREF(exc);
8539 Py_XDECREF(errorHandler);
8540 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 Py_XDECREF(exc);
8545 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546 return NULL;
8547}
8548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549/* Deprecated. Use PyUnicode_Translate instead. */
8550PyObject *
8551PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8552 Py_ssize_t size,
8553 PyObject *mapping,
8554 const char *errors)
8555{
Christian Heimes5f520f42012-09-11 14:03:25 +02008556 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8558 if (!unicode)
8559 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008560 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8561 Py_DECREF(unicode);
8562 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563}
8564
Alexander Belopolsky40018472011-02-26 01:02:56 +00008565PyObject *
8566PyUnicode_Translate(PyObject *str,
8567 PyObject *mapping,
8568 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569{
8570 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008571
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 str = PyUnicode_FromObject(str);
8573 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008574 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 Py_DECREF(str);
8577 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578}
Tim Petersced69f82003-09-16 20:30:58 +00008579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008581fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582{
8583 /* No need to call PyUnicode_READY(self) because this function is only
8584 called as a callback from fixup() which does it already. */
8585 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8586 const int kind = PyUnicode_KIND(self);
8587 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008588 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008589 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 Py_ssize_t i;
8591
8592 for (i = 0; i < len; ++i) {
8593 ch = PyUnicode_READ(kind, data, i);
8594 fixed = 0;
8595 if (ch > 127) {
8596 if (Py_UNICODE_ISSPACE(ch))
8597 fixed = ' ';
8598 else {
8599 const int decimal = Py_UNICODE_TODECIMAL(ch);
8600 if (decimal >= 0)
8601 fixed = '0' + decimal;
8602 }
8603 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008604 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008605 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 PyUnicode_WRITE(kind, data, i, fixed);
8607 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008608 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008609 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 }
8612
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008613 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614}
8615
8616PyObject *
8617_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8618{
8619 if (!PyUnicode_Check(unicode)) {
8620 PyErr_BadInternalCall();
8621 return NULL;
8622 }
8623 if (PyUnicode_READY(unicode) == -1)
8624 return NULL;
8625 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8626 /* If the string is already ASCII, just return the same string */
8627 Py_INCREF(unicode);
8628 return unicode;
8629 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008630 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631}
8632
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008633PyObject *
8634PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8635 Py_ssize_t length)
8636{
Victor Stinnerf0124502011-11-21 23:12:56 +01008637 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008638 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008639 Py_UCS4 maxchar;
8640 enum PyUnicode_Kind kind;
8641 void *data;
8642
Victor Stinner99d7ad02012-02-22 13:37:39 +01008643 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008644 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008645 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008646 if (ch > 127) {
8647 int decimal = Py_UNICODE_TODECIMAL(ch);
8648 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008649 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008650 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008651 }
8652 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008653
8654 /* Copy to a new string */
8655 decimal = PyUnicode_New(length, maxchar);
8656 if (decimal == NULL)
8657 return decimal;
8658 kind = PyUnicode_KIND(decimal);
8659 data = PyUnicode_DATA(decimal);
8660 /* Iterate over code points */
8661 for (i = 0; i < length; i++) {
8662 Py_UNICODE ch = s[i];
8663 if (ch > 127) {
8664 int decimal = Py_UNICODE_TODECIMAL(ch);
8665 if (decimal >= 0)
8666 ch = '0' + decimal;
8667 }
8668 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008670 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008671}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008672/* --- Decimal Encoder ---------------------------------------------------- */
8673
Alexander Belopolsky40018472011-02-26 01:02:56 +00008674int
8675PyUnicode_EncodeDecimal(Py_UNICODE *s,
8676 Py_ssize_t length,
8677 char *output,
8678 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008679{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008680 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008681 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008682 enum PyUnicode_Kind kind;
8683 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008684
8685 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 PyErr_BadArgument();
8687 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008688 }
8689
Victor Stinner42bf7752011-11-21 22:52:58 +01008690 unicode = PyUnicode_FromUnicode(s, length);
8691 if (unicode == NULL)
8692 return -1;
8693
Benjamin Petersonbac79492012-01-14 13:34:47 -05008694 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008695 Py_DECREF(unicode);
8696 return -1;
8697 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008698 kind = PyUnicode_KIND(unicode);
8699 data = PyUnicode_DATA(unicode);
8700
Victor Stinnerb84d7232011-11-22 01:50:07 +01008701 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008702 PyObject *exc;
8703 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008705 Py_ssize_t startpos;
8706
8707 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008708
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008710 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008711 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008713 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 decimal = Py_UNICODE_TODECIMAL(ch);
8715 if (decimal >= 0) {
8716 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008717 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 continue;
8719 }
8720 if (0 < ch && ch < 256) {
8721 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008722 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 continue;
8724 }
Victor Stinner6345be92011-11-25 20:09:01 +01008725
Victor Stinner42bf7752011-11-21 22:52:58 +01008726 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008727 exc = NULL;
8728 raise_encode_exception(&exc, "decimal", unicode,
8729 startpos, startpos+1,
8730 "invalid decimal Unicode string");
8731 Py_XDECREF(exc);
8732 Py_DECREF(unicode);
8733 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008734 }
8735 /* 0-terminate the output string */
8736 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008737 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008738 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008739}
8740
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741/* --- Helpers ------------------------------------------------------------ */
8742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008744any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745 Py_ssize_t start,
8746 Py_ssize_t end)
8747{
8748 int kind1, kind2, kind;
8749 void *buf1, *buf2;
8750 Py_ssize_t len1, len2, result;
8751
8752 kind1 = PyUnicode_KIND(s1);
8753 kind2 = PyUnicode_KIND(s2);
8754 kind = kind1 > kind2 ? kind1 : kind2;
8755 buf1 = PyUnicode_DATA(s1);
8756 buf2 = PyUnicode_DATA(s2);
8757 if (kind1 != kind)
8758 buf1 = _PyUnicode_AsKind(s1, kind);
8759 if (!buf1)
8760 return -2;
8761 if (kind2 != kind)
8762 buf2 = _PyUnicode_AsKind(s2, kind);
8763 if (!buf2) {
8764 if (kind1 != kind) PyMem_Free(buf1);
8765 return -2;
8766 }
8767 len1 = PyUnicode_GET_LENGTH(s1);
8768 len2 = PyUnicode_GET_LENGTH(s2);
8769
Victor Stinner794d5672011-10-10 03:21:36 +02008770 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008771 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008772 case PyUnicode_1BYTE_KIND:
8773 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8774 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8775 else
8776 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8777 break;
8778 case PyUnicode_2BYTE_KIND:
8779 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8780 break;
8781 case PyUnicode_4BYTE_KIND:
8782 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8783 break;
8784 default:
8785 assert(0); result = -2;
8786 }
8787 }
8788 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008789 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008790 case PyUnicode_1BYTE_KIND:
8791 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8792 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8793 else
8794 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8795 break;
8796 case PyUnicode_2BYTE_KIND:
8797 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8798 break;
8799 case PyUnicode_4BYTE_KIND:
8800 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8801 break;
8802 default:
8803 assert(0); result = -2;
8804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 }
8806
8807 if (kind1 != kind)
8808 PyMem_Free(buf1);
8809 if (kind2 != kind)
8810 PyMem_Free(buf2);
8811
8812 return result;
8813}
8814
8815Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008816_PyUnicode_InsertThousandsGrouping(
8817 PyObject *unicode, Py_ssize_t index,
8818 Py_ssize_t n_buffer,
8819 void *digits, Py_ssize_t n_digits,
8820 Py_ssize_t min_width,
8821 const char *grouping, PyObject *thousands_sep,
8822 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823{
Victor Stinner41a863c2012-02-24 00:37:51 +01008824 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008825 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008826 Py_ssize_t thousands_sep_len;
8827 Py_ssize_t len;
8828
8829 if (unicode != NULL) {
8830 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008831 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008832 }
8833 else {
8834 kind = PyUnicode_1BYTE_KIND;
8835 data = NULL;
8836 }
8837 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8838 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8839 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8840 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008841 if (thousands_sep_kind < kind) {
8842 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8843 if (!thousands_sep_data)
8844 return -1;
8845 }
8846 else {
8847 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8848 if (!data)
8849 return -1;
8850 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008851 }
8852
Benjamin Petersonead6b532011-12-20 17:23:42 -06008853 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008855 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008856 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008857 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008858 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008859 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008860 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008861 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008862 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008863 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008864 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008865 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008867 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008868 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008869 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008870 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008871 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008873 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008874 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008875 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008876 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008877 break;
8878 default:
8879 assert(0);
8880 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008882 if (unicode != NULL && thousands_sep_kind != kind) {
8883 if (thousands_sep_kind < kind)
8884 PyMem_Free(thousands_sep_data);
8885 else
8886 PyMem_Free(data);
8887 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008888 if (unicode == NULL) {
8889 *maxchar = 127;
8890 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008891 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008892 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008893 }
8894 }
8895 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896}
8897
8898
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; they are updated in
   place.  `end` is clamped to `len`; a negative `start` or `end` is
   taken relative to `len` and then clamped to 0.  `start` is not clamped
   against `len`, and `len` may be evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe in an unbraced if/else); use it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008913
Alexander Belopolsky40018472011-02-26 01:02:56 +00008914Py_ssize_t
8915PyUnicode_Count(PyObject *str,
8916 PyObject *substr,
8917 Py_ssize_t start,
8918 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008920 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008921 PyObject* str_obj;
8922 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 int kind1, kind2, kind;
8924 void *buf1 = NULL, *buf2 = NULL;
8925 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008926
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008927 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008928 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008930 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008931 if (!sub_obj) {
8932 Py_DECREF(str_obj);
8933 return -1;
8934 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008935 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008936 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 Py_DECREF(str_obj);
8938 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 }
Tim Petersced69f82003-09-16 20:30:58 +00008940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 kind1 = PyUnicode_KIND(str_obj);
8942 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008943 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008946 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008947 if (kind2 > kind) {
8948 Py_DECREF(sub_obj);
8949 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008950 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008951 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008952 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008953 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 if (!buf2)
8955 goto onError;
8956 len1 = PyUnicode_GET_LENGTH(str_obj);
8957 len2 = PyUnicode_GET_LENGTH(sub_obj);
8958
8959 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008960 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008962 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8963 result = asciilib_count(
8964 ((Py_UCS1*)buf1) + start, end - start,
8965 buf2, len2, PY_SSIZE_T_MAX
8966 );
8967 else
8968 result = ucs1lib_count(
8969 ((Py_UCS1*)buf1) + start, end - start,
8970 buf2, len2, PY_SSIZE_T_MAX
8971 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 break;
8973 case PyUnicode_2BYTE_KIND:
8974 result = ucs2lib_count(
8975 ((Py_UCS2*)buf1) + start, end - start,
8976 buf2, len2, PY_SSIZE_T_MAX
8977 );
8978 break;
8979 case PyUnicode_4BYTE_KIND:
8980 result = ucs4lib_count(
8981 ((Py_UCS4*)buf1) + start, end - start,
8982 buf2, len2, PY_SSIZE_T_MAX
8983 );
8984 break;
8985 default:
8986 assert(0); result = 0;
8987 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008988
8989 Py_DECREF(sub_obj);
8990 Py_DECREF(str_obj);
8991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 if (kind2 != kind)
8993 PyMem_Free(buf2);
8994
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996 onError:
8997 Py_DECREF(sub_obj);
8998 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 if (kind2 != kind && buf2)
9000 PyMem_Free(buf2);
9001 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002}
9003
Alexander Belopolsky40018472011-02-26 01:02:56 +00009004Py_ssize_t
9005PyUnicode_Find(PyObject *str,
9006 PyObject *sub,
9007 Py_ssize_t start,
9008 Py_ssize_t end,
9009 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009011 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009012
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009014 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009016 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009017 if (!sub) {
9018 Py_DECREF(str);
9019 return -2;
9020 }
9021 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9022 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 Py_DECREF(str);
9024 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 }
Tim Petersced69f82003-09-16 20:30:58 +00009026
Victor Stinner794d5672011-10-10 03:21:36 +02009027 result = any_find_slice(direction,
9028 str, sub, start, end
9029 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009030
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009032 Py_DECREF(sub);
9033
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 return result;
9035}
9036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009037Py_ssize_t
9038PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9039 Py_ssize_t start, Py_ssize_t end,
9040 int direction)
9041{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009043 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 if (PyUnicode_READY(str) == -1)
9045 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009046 if (start < 0 || end < 0) {
9047 PyErr_SetString(PyExc_IndexError, "string index out of range");
9048 return -2;
9049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 if (end > PyUnicode_GET_LENGTH(str))
9051 end = PyUnicode_GET_LENGTH(str);
9052 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009053 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9054 kind, end-start, ch, direction);
9055 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009057 else
9058 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059}
9060
Alexander Belopolsky40018472011-02-26 01:02:56 +00009061static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009062tailmatch(PyObject *self,
9063 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009064 Py_ssize_t start,
9065 Py_ssize_t end,
9066 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 int kind_self;
9069 int kind_sub;
9070 void *data_self;
9071 void *data_sub;
9072 Py_ssize_t offset;
9073 Py_ssize_t i;
9074 Py_ssize_t end_sub;
9075
9076 if (PyUnicode_READY(self) == -1 ||
9077 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009078 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079
9080 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081 return 1;
9082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9084 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 kind_self = PyUnicode_KIND(self);
9089 data_self = PyUnicode_DATA(self);
9090 kind_sub = PyUnicode_KIND(substring);
9091 data_sub = PyUnicode_DATA(substring);
9092 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9093
9094 if (direction > 0)
9095 offset = end;
9096 else
9097 offset = start;
9098
9099 if (PyUnicode_READ(kind_self, data_self, offset) ==
9100 PyUnicode_READ(kind_sub, data_sub, 0) &&
9101 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9102 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9103 /* If both are of the same kind, memcmp is sufficient */
9104 if (kind_self == kind_sub) {
9105 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009106 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 data_sub,
9108 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009109 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 }
9111 /* otherwise we have to compare each character by first accesing it */
9112 else {
9113 /* We do not need to compare 0 and len(substring)-1 because
9114 the if statement above ensured already that they are equal
9115 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116 for (i = 1; i < end_sub; ++i) {
9117 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9118 PyUnicode_READ(kind_sub, data_sub, i))
9119 return 0;
9120 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 }
9124
9125 return 0;
9126}
9127
Alexander Belopolsky40018472011-02-26 01:02:56 +00009128Py_ssize_t
9129PyUnicode_Tailmatch(PyObject *str,
9130 PyObject *substr,
9131 Py_ssize_t start,
9132 Py_ssize_t end,
9133 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009135 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009136
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 str = PyUnicode_FromObject(str);
9138 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 substr = PyUnicode_FromObject(substr);
9141 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 Py_DECREF(str);
9143 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144 }
Tim Petersced69f82003-09-16 20:30:58 +00009145
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009146 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148 Py_DECREF(str);
9149 Py_DECREF(substr);
9150 return result;
9151}
9152
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153/* Apply fixfct filter to the Unicode object self and return a
9154 reference to the modified object */
9155
Alexander Belopolsky40018472011-02-26 01:02:56 +00009156static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009157fixup(PyObject *self,
9158 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 PyObject *u;
9161 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009162 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009164 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009167 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 /* fix functions return the new maximum character in a string,
9170 if the kind of the resulting unicode object does not change,
9171 everything is fine. Otherwise we need to change the string kind
9172 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009173 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009174
9175 if (maxchar_new == 0) {
9176 /* no changes */;
9177 if (PyUnicode_CheckExact(self)) {
9178 Py_DECREF(u);
9179 Py_INCREF(self);
9180 return self;
9181 }
9182 else
9183 return u;
9184 }
9185
Victor Stinnere6abb482012-05-02 01:15:40 +02009186 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187
Victor Stinnereaab6042011-12-11 22:22:39 +01009188 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009190
9191 /* In case the maximum character changed, we need to
9192 convert the string to the new category. */
9193 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9194 if (v == NULL) {
9195 Py_DECREF(u);
9196 return NULL;
9197 }
9198 if (maxchar_new > maxchar_old) {
9199 /* If the maxchar increased so that the kind changed, not all
9200 characters are representable anymore and we need to fix the
9201 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009202 _PyUnicode_FastCopyCharacters(v, 0,
9203 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009204 maxchar_old = fixfct(v);
9205 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 }
9207 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009208 _PyUnicode_FastCopyCharacters(v, 0,
9209 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009210 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009211 Py_DECREF(u);
9212 assert(_PyUnicode_CheckConsistency(v, 1));
9213 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214}
9215
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009216static PyObject *
9217ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009219 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9220 char *resdata, *data = PyUnicode_DATA(self);
9221 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009222
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009223 res = PyUnicode_New(len, 127);
9224 if (res == NULL)
9225 return NULL;
9226 resdata = PyUnicode_DATA(res);
9227 if (lower)
9228 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009230 _Py_bytes_upper(resdata, data, len);
9231 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232}
9233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009235handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009237 Py_ssize_t j;
9238 int final_sigma;
9239 Py_UCS4 c;
9240 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009241
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009242 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9243
9244 where ! is a negation and \p{xxx} is a character with property xxx.
9245 */
9246 for (j = i - 1; j >= 0; j--) {
9247 c = PyUnicode_READ(kind, data, j);
9248 if (!_PyUnicode_IsCaseIgnorable(c))
9249 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009251 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9252 if (final_sigma) {
9253 for (j = i + 1; j < length; j++) {
9254 c = PyUnicode_READ(kind, data, j);
9255 if (!_PyUnicode_IsCaseIgnorable(c))
9256 break;
9257 }
9258 final_sigma = j == length || !_PyUnicode_IsCased(c);
9259 }
9260 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261}
9262
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009263static int
9264lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9265 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009267 /* Obscure special case. */
9268 if (c == 0x3A3) {
9269 mapped[0] = handle_capital_sigma(kind, data, length, i);
9270 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009272 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273}
9274
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009275static Py_ssize_t
9276do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009277{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009278 Py_ssize_t i, k = 0;
9279 int n_res, j;
9280 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009281
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009282 c = PyUnicode_READ(kind, data, 0);
9283 n_res = _PyUnicode_ToUpperFull(c, mapped);
9284 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009285 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009286 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009288 for (i = 1; i < length; i++) {
9289 c = PyUnicode_READ(kind, data, i);
9290 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9291 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009292 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009293 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009294 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009295 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297}
9298
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009299static Py_ssize_t
9300do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9301 Py_ssize_t i, k = 0;
9302
9303 for (i = 0; i < length; i++) {
9304 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9305 int n_res, j;
9306 if (Py_UNICODE_ISUPPER(c)) {
9307 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9308 }
9309 else if (Py_UNICODE_ISLOWER(c)) {
9310 n_res = _PyUnicode_ToUpperFull(c, mapped);
9311 }
9312 else {
9313 n_res = 1;
9314 mapped[0] = c;
9315 }
9316 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009317 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009318 res[k++] = mapped[j];
9319 }
9320 }
9321 return k;
9322}
9323
9324static Py_ssize_t
9325do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9326 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009328 Py_ssize_t i, k = 0;
9329
9330 for (i = 0; i < length; i++) {
9331 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9332 int n_res, j;
9333 if (lower)
9334 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9335 else
9336 n_res = _PyUnicode_ToUpperFull(c, mapped);
9337 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009338 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009339 res[k++] = mapped[j];
9340 }
9341 }
9342 return k;
9343}
9344
9345static Py_ssize_t
9346do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9347{
9348 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9349}
9350
9351static Py_ssize_t
9352do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9353{
9354 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9355}
9356
Benjamin Petersone51757f2012-01-12 21:10:29 -05009357static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009358do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9359{
9360 Py_ssize_t i, k = 0;
9361
9362 for (i = 0; i < length; i++) {
9363 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9364 Py_UCS4 mapped[3];
9365 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9366 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009367 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009368 res[k++] = mapped[j];
9369 }
9370 }
9371 return k;
9372}
9373
9374static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009375do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9376{
9377 Py_ssize_t i, k = 0;
9378 int previous_is_cased;
9379
9380 previous_is_cased = 0;
9381 for (i = 0; i < length; i++) {
9382 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9383 Py_UCS4 mapped[3];
9384 int n_res, j;
9385
9386 if (previous_is_cased)
9387 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9388 else
9389 n_res = _PyUnicode_ToTitleFull(c, mapped);
9390
9391 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009392 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009393 res[k++] = mapped[j];
9394 }
9395
9396 previous_is_cased = _PyUnicode_IsCased(c);
9397 }
9398 return k;
9399}
9400
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009401static PyObject *
9402case_operation(PyObject *self,
9403 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9404{
9405 PyObject *res = NULL;
9406 Py_ssize_t length, newlength = 0;
9407 int kind, outkind;
9408 void *data, *outdata;
9409 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9410
Benjamin Petersoneea48462012-01-16 14:28:50 -05009411 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009412
9413 kind = PyUnicode_KIND(self);
9414 data = PyUnicode_DATA(self);
9415 length = PyUnicode_GET_LENGTH(self);
9416 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9417 if (tmp == NULL)
9418 return PyErr_NoMemory();
9419 newlength = perform(kind, data, length, tmp, &maxchar);
9420 res = PyUnicode_New(newlength, maxchar);
9421 if (res == NULL)
9422 goto leave;
9423 tmpend = tmp + newlength;
9424 outdata = PyUnicode_DATA(res);
9425 outkind = PyUnicode_KIND(res);
9426 switch (outkind) {
9427 case PyUnicode_1BYTE_KIND:
9428 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9429 break;
9430 case PyUnicode_2BYTE_KIND:
9431 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9432 break;
9433 case PyUnicode_4BYTE_KIND:
9434 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9435 break;
9436 default:
9437 assert(0);
9438 break;
9439 }
9440 leave:
9441 PyMem_FREE(tmp);
9442 return res;
9443}
9444
Tim Peters8ce9f162004-08-27 01:49:32 +00009445PyObject *
9446PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009449 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009451 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009452 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9453 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009454 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009456 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009458 int use_memcpy;
9459 unsigned char *res_data = NULL, *sep_data = NULL;
9460 PyObject *last_obj;
9461 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462
Tim Peters05eba1f2004-08-27 21:32:02 +00009463 fseq = PySequence_Fast(seq, "");
9464 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009465 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009466 }
9467
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009468 /* NOTE: the following code can't call back into Python code,
9469 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009470 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009471
Tim Peters05eba1f2004-08-27 21:32:02 +00009472 seqlen = PySequence_Fast_GET_SIZE(fseq);
9473 /* If empty sequence, return u"". */
9474 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009475 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009476 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009477 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009478
Tim Peters05eba1f2004-08-27 21:32:02 +00009479 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009480 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009481 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009482 if (seqlen == 1) {
9483 if (PyUnicode_CheckExact(items[0])) {
9484 res = items[0];
9485 Py_INCREF(res);
9486 Py_DECREF(fseq);
9487 return res;
9488 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009489 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009490 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009491 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009492 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009493 /* Set up sep and seplen */
9494 if (separator == NULL) {
9495 /* fall back to a blank space separator */
9496 sep = PyUnicode_FromOrdinal(' ');
9497 if (!sep)
9498 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009499 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009500 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009501 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009502 else {
9503 if (!PyUnicode_Check(separator)) {
9504 PyErr_Format(PyExc_TypeError,
9505 "separator: expected str instance,"
9506 " %.80s found",
9507 Py_TYPE(separator)->tp_name);
9508 goto onError;
9509 }
9510 if (PyUnicode_READY(separator))
9511 goto onError;
9512 sep = separator;
9513 seplen = PyUnicode_GET_LENGTH(separator);
9514 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9515 /* inc refcount to keep this code path symmetric with the
9516 above case of a blank separator */
9517 Py_INCREF(sep);
9518 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009519 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009520 }
9521
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009522 /* There are at least two things to join, or else we have a subclass
9523 * of str in the sequence.
9524 * Do a pre-pass to figure out the total amount of space we'll
9525 * need (sz), and see whether all argument are strings.
9526 */
9527 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009528#ifdef Py_DEBUG
9529 use_memcpy = 0;
9530#else
9531 use_memcpy = 1;
9532#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009533 for (i = 0; i < seqlen; i++) {
9534 const Py_ssize_t old_sz = sz;
9535 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009536 if (!PyUnicode_Check(item)) {
9537 PyErr_Format(PyExc_TypeError,
9538 "sequence item %zd: expected str instance,"
9539 " %.80s found",
9540 i, Py_TYPE(item)->tp_name);
9541 goto onError;
9542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 if (PyUnicode_READY(item) == -1)
9544 goto onError;
9545 sz += PyUnicode_GET_LENGTH(item);
9546 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009547 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009548 if (i != 0)
9549 sz += seplen;
9550 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9551 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009552 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009553 goto onError;
9554 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009555 if (use_memcpy && last_obj != NULL) {
9556 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9557 use_memcpy = 0;
9558 }
9559 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009560 }
Tim Petersced69f82003-09-16 20:30:58 +00009561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009563 if (res == NULL)
9564 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009565
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009566 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009567#ifdef Py_DEBUG
9568 use_memcpy = 0;
9569#else
9570 if (use_memcpy) {
9571 res_data = PyUnicode_1BYTE_DATA(res);
9572 kind = PyUnicode_KIND(res);
9573 if (seplen != 0)
9574 sep_data = PyUnicode_1BYTE_DATA(sep);
9575 }
9576#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009577 if (use_memcpy) {
9578 for (i = 0; i < seqlen; ++i) {
9579 Py_ssize_t itemlen;
9580 item = items[i];
9581
9582 /* Copy item, and maybe the separator. */
9583 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009584 Py_MEMCPY(res_data,
9585 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009586 kind * seplen);
9587 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009588 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009589
9590 itemlen = PyUnicode_GET_LENGTH(item);
9591 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009592 Py_MEMCPY(res_data,
9593 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009594 kind * itemlen);
9595 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009596 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009597 }
9598 assert(res_data == PyUnicode_1BYTE_DATA(res)
9599 + kind * PyUnicode_GET_LENGTH(res));
9600 }
9601 else {
9602 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9603 Py_ssize_t itemlen;
9604 item = items[i];
9605
9606 /* Copy item, and maybe the separator. */
9607 if (i && seplen != 0) {
9608 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9609 res_offset += seplen;
9610 }
9611
9612 itemlen = PyUnicode_GET_LENGTH(item);
9613 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009614 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009615 res_offset += itemlen;
9616 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009617 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009619 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009620
Tim Peters05eba1f2004-08-27 21:32:02 +00009621 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009623 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625
Benjamin Peterson29060642009-01-31 22:14:21 +00009626 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009627 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009629 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630 return NULL;
9631}
9632
/* Store `value` into `length` consecutive characters of the buffer `data`,
   which holds characters of storage kind `kind`, starting at character
   index `start`.

   The arguments are expanded more than once (`value` and `length` once per
   loop iteration), so they must be free of side effects.  `kind` must be
   one of the 1-, 2- or 4-byte kinds. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9656
Victor Stinnerd3f08822012-05-29 12:57:52 +02009657void
9658_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9659 Py_UCS4 fill_char)
9660{
9661 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9662 const void *data = PyUnicode_DATA(unicode);
9663 assert(PyUnicode_IS_READY(unicode));
9664 assert(unicode_modifiable(unicode));
9665 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9666 assert(start >= 0);
9667 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9668 FILL(kind, data, fill_char, start, length);
9669}
9670
Victor Stinner3fe55312012-01-04 00:33:50 +01009671Py_ssize_t
9672PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9673 Py_UCS4 fill_char)
9674{
9675 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009676
9677 if (!PyUnicode_Check(unicode)) {
9678 PyErr_BadInternalCall();
9679 return -1;
9680 }
9681 if (PyUnicode_READY(unicode) == -1)
9682 return -1;
9683 if (unicode_check_modifiable(unicode))
9684 return -1;
9685
Victor Stinnerd3f08822012-05-29 12:57:52 +02009686 if (start < 0) {
9687 PyErr_SetString(PyExc_IndexError, "string index out of range");
9688 return -1;
9689 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009690 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9691 PyErr_SetString(PyExc_ValueError,
9692 "fill character is bigger than "
9693 "the string maximum character");
9694 return -1;
9695 }
9696
9697 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9698 length = Py_MIN(maxlen, length);
9699 if (length <= 0)
9700 return 0;
9701
Victor Stinnerd3f08822012-05-29 12:57:52 +02009702 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009703 return length;
9704}
9705
Victor Stinner9310abb2011-10-05 00:59:23 +02009706static PyObject *
9707pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009708 Py_ssize_t left,
9709 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 PyObject *u;
9713 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009714 int kind;
9715 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716
9717 if (left < 0)
9718 left = 0;
9719 if (right < 0)
9720 right = 0;
9721
Victor Stinnerc4b49542011-12-11 22:44:26 +01009722 if (left == 0 && right == 0)
9723 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9726 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009727 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9728 return NULL;
9729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009731 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009733 if (!u)
9734 return NULL;
9735
9736 kind = PyUnicode_KIND(u);
9737 data = PyUnicode_DATA(u);
9738 if (left)
9739 FILL(kind, data, fill, 0, left);
9740 if (right)
9741 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009742 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009743 assert(_PyUnicode_CheckConsistency(u, 1));
9744 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745}
9746
Alexander Belopolsky40018472011-02-26 01:02:56 +00009747PyObject *
9748PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751
9752 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009753 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009754 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009755 if (PyUnicode_READY(string) == -1) {
9756 Py_DECREF(string);
9757 return NULL;
9758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759
Benjamin Petersonead6b532011-12-20 17:23:42 -06009760 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009762 if (PyUnicode_IS_ASCII(string))
9763 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009764 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009765 PyUnicode_GET_LENGTH(string), keepends);
9766 else
9767 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009768 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009769 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 break;
9771 case PyUnicode_2BYTE_KIND:
9772 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009773 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 PyUnicode_GET_LENGTH(string), keepends);
9775 break;
9776 case PyUnicode_4BYTE_KIND:
9777 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 PyUnicode_GET_LENGTH(string), keepends);
9780 break;
9781 default:
9782 assert(0);
9783 list = 0;
9784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785 Py_DECREF(string);
9786 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787}
9788
Alexander Belopolsky40018472011-02-26 01:02:56 +00009789static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009790split(PyObject *self,
9791 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009792 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 int kind1, kind2, kind;
9795 void *buf1, *buf2;
9796 Py_ssize_t len1, len2;
9797 PyObject* out;
9798
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009800 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 if (PyUnicode_READY(self) == -1)
9803 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009806 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009808 if (PyUnicode_IS_ASCII(self))
9809 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009810 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009811 PyUnicode_GET_LENGTH(self), maxcount
9812 );
9813 else
9814 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009815 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009816 PyUnicode_GET_LENGTH(self), maxcount
9817 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 case PyUnicode_2BYTE_KIND:
9819 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 PyUnicode_GET_LENGTH(self), maxcount
9822 );
9823 case PyUnicode_4BYTE_KIND:
9824 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009825 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 PyUnicode_GET_LENGTH(self), maxcount
9827 );
9828 default:
9829 assert(0);
9830 return NULL;
9831 }
9832
9833 if (PyUnicode_READY(substring) == -1)
9834 return NULL;
9835
9836 kind1 = PyUnicode_KIND(self);
9837 kind2 = PyUnicode_KIND(substring);
9838 kind = kind1 > kind2 ? kind1 : kind2;
9839 buf1 = PyUnicode_DATA(self);
9840 buf2 = PyUnicode_DATA(substring);
9841 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009842 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 if (!buf1)
9844 return NULL;
9845 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009846 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 if (!buf2) {
9848 if (kind1 != kind) PyMem_Free(buf1);
9849 return NULL;
9850 }
9851 len1 = PyUnicode_GET_LENGTH(self);
9852 len2 = PyUnicode_GET_LENGTH(substring);
9853
Benjamin Petersonead6b532011-12-20 17:23:42 -06009854 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009856 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9857 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009858 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009859 else
9860 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009861 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 break;
9863 case PyUnicode_2BYTE_KIND:
9864 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009865 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 break;
9867 case PyUnicode_4BYTE_KIND:
9868 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009869 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 break;
9871 default:
9872 out = NULL;
9873 }
9874 if (kind1 != kind)
9875 PyMem_Free(buf1);
9876 if (kind2 != kind)
9877 PyMem_Free(buf2);
9878 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879}
9880
Alexander Belopolsky40018472011-02-26 01:02:56 +00009881static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009882rsplit(PyObject *self,
9883 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009884 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 int kind1, kind2, kind;
9887 void *buf1, *buf2;
9888 Py_ssize_t len1, len2;
9889 PyObject* out;
9890
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009891 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009892 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 if (PyUnicode_READY(self) == -1)
9895 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009898 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009900 if (PyUnicode_IS_ASCII(self))
9901 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009902 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009903 PyUnicode_GET_LENGTH(self), maxcount
9904 );
9905 else
9906 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009907 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009908 PyUnicode_GET_LENGTH(self), maxcount
9909 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 case PyUnicode_2BYTE_KIND:
9911 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 PyUnicode_GET_LENGTH(self), maxcount
9914 );
9915 case PyUnicode_4BYTE_KIND:
9916 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009917 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 PyUnicode_GET_LENGTH(self), maxcount
9919 );
9920 default:
9921 assert(0);
9922 return NULL;
9923 }
9924
9925 if (PyUnicode_READY(substring) == -1)
9926 return NULL;
9927
9928 kind1 = PyUnicode_KIND(self);
9929 kind2 = PyUnicode_KIND(substring);
9930 kind = kind1 > kind2 ? kind1 : kind2;
9931 buf1 = PyUnicode_DATA(self);
9932 buf2 = PyUnicode_DATA(substring);
9933 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009934 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 if (!buf1)
9936 return NULL;
9937 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009938 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 if (!buf2) {
9940 if (kind1 != kind) PyMem_Free(buf1);
9941 return NULL;
9942 }
9943 len1 = PyUnicode_GET_LENGTH(self);
9944 len2 = PyUnicode_GET_LENGTH(substring);
9945
Benjamin Petersonead6b532011-12-20 17:23:42 -06009946 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009948 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9949 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009950 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009951 else
9952 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009953 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 break;
9955 case PyUnicode_2BYTE_KIND:
9956 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009957 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 break;
9959 case PyUnicode_4BYTE_KIND:
9960 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009961 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 break;
9963 default:
9964 out = NULL;
9965 }
9966 if (kind1 != kind)
9967 PyMem_Free(buf1);
9968 if (kind2 != kind)
9969 PyMem_Free(buf2);
9970 return out;
9971}
9972
9973static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009974anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9975 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009977 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009979 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9980 return asciilib_find(buf1, len1, buf2, len2, offset);
9981 else
9982 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 case PyUnicode_2BYTE_KIND:
9984 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9985 case PyUnicode_4BYTE_KIND:
9986 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9987 }
9988 assert(0);
9989 return -1;
9990}
9991
9992static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009993anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9994 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009996 switch (kind) {
9997 case PyUnicode_1BYTE_KIND:
9998 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9999 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10000 else
10001 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10002 case PyUnicode_2BYTE_KIND:
10003 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10004 case PyUnicode_4BYTE_KIND:
10005 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10006 }
10007 assert(0);
10008 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010009}
10010
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010011static void
10012replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10013 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10014{
10015 int kind = PyUnicode_KIND(u);
10016 void *data = PyUnicode_DATA(u);
10017 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10018 if (kind == PyUnicode_1BYTE_KIND) {
10019 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10020 (Py_UCS1 *)data + len,
10021 u1, u2, maxcount);
10022 }
10023 else if (kind == PyUnicode_2BYTE_KIND) {
10024 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10025 (Py_UCS2 *)data + len,
10026 u1, u2, maxcount);
10027 }
10028 else {
10029 assert(kind == PyUnicode_4BYTE_KIND);
10030 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10031 (Py_UCS4 *)data + len,
10032 u1, u2, maxcount);
10033 }
10034}
10035
Alexander Belopolsky40018472011-02-26 01:02:56 +000010036static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037replace(PyObject *self, PyObject *str1,
10038 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 PyObject *u;
10041 char *sbuf = PyUnicode_DATA(self);
10042 char *buf1 = PyUnicode_DATA(str1);
10043 char *buf2 = PyUnicode_DATA(str2);
10044 int srelease = 0, release1 = 0, release2 = 0;
10045 int skind = PyUnicode_KIND(self);
10046 int kind1 = PyUnicode_KIND(str1);
10047 int kind2 = PyUnicode_KIND(str2);
10048 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10049 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10050 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010051 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010052 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053
10054 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010055 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010057 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058
Victor Stinner59de0ee2011-10-07 10:01:28 +020010059 if (str1 == str2)
10060 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061
Victor Stinner49a0a212011-10-12 23:46:10 +020010062 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010063 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10064 if (maxchar < maxchar_str1)
10065 /* substring too wide to be present */
10066 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010067 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10068 /* Replacing str1 with str2 may cause a maxchar reduction in the
10069 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010070 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010071 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010076 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010078 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010079 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010080 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010081
Victor Stinner69ed0f42013-04-09 21:48:24 +020010082 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010083 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010084 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010085 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010086 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010088 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010090
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010091 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10092 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010093 }
10094 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 int rkind = skind;
10096 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010097 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 if (kind1 < rkind) {
10100 /* widen substring */
10101 buf1 = _PyUnicode_AsKind(str1, rkind);
10102 if (!buf1) goto error;
10103 release1 = 1;
10104 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010105 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010106 if (i < 0)
10107 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 if (rkind > kind2) {
10109 /* widen replacement */
10110 buf2 = _PyUnicode_AsKind(str2, rkind);
10111 if (!buf2) goto error;
10112 release2 = 1;
10113 }
10114 else if (rkind < kind2) {
10115 /* widen self and buf1 */
10116 rkind = kind2;
10117 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010118 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 sbuf = _PyUnicode_AsKind(self, rkind);
10120 if (!sbuf) goto error;
10121 srelease = 1;
10122 buf1 = _PyUnicode_AsKind(str1, rkind);
10123 if (!buf1) goto error;
10124 release1 = 1;
10125 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010126 u = PyUnicode_New(slen, maxchar);
10127 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010129 assert(PyUnicode_KIND(u) == rkind);
10130 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010131
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010132 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010133 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010134 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010136 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010138
10139 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010140 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010141 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010143 if (i == -1)
10144 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010145 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010151 }
10152 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010154 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 int rkind = skind;
10156 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 buf1 = _PyUnicode_AsKind(str1, rkind);
10161 if (!buf1) goto error;
10162 release1 = 1;
10163 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 if (n == 0)
10166 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010168 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 buf2 = _PyUnicode_AsKind(str2, rkind);
10170 if (!buf2) goto error;
10171 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010174 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 rkind = kind2;
10176 sbuf = _PyUnicode_AsKind(self, rkind);
10177 if (!sbuf) goto error;
10178 srelease = 1;
10179 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010180 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 buf1 = _PyUnicode_AsKind(str1, rkind);
10182 if (!buf1) goto error;
10183 release1 = 1;
10184 }
10185 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10186 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010187 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 PyErr_SetString(PyExc_OverflowError,
10189 "replace string is too long");
10190 goto error;
10191 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010192 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010193 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010194 _Py_INCREF_UNICODE_EMPTY();
10195 if (!unicode_empty)
10196 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010197 u = unicode_empty;
10198 goto done;
10199 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010200 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 PyErr_SetString(PyExc_OverflowError,
10202 "replace string is too long");
10203 goto error;
10204 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010205 u = PyUnicode_New(new_size, maxchar);
10206 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010208 assert(PyUnicode_KIND(u) == rkind);
10209 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 ires = i = 0;
10211 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010212 while (n-- > 0) {
10213 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010215 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010217 if (j == -1)
10218 break;
10219 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010221 memcpy(res + rkind * ires,
10222 sbuf + rkind * i,
10223 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010225 }
10226 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010237 memcpy(res + rkind * ires,
10238 sbuf + rkind * i,
10239 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010240 }
10241 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 /* interleave */
10243 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010244 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010248 if (--n <= 0)
10249 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 memcpy(res + rkind * ires,
10251 sbuf + rkind * i,
10252 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 ires++;
10254 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010256 memcpy(res + rkind * ires,
10257 sbuf + rkind * i,
10258 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010259 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010260 }
10261
10262 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010263 unicode_adjust_maxchar(&u);
10264 if (u == NULL)
10265 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010267
10268 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (srelease)
10270 PyMem_FREE(sbuf);
10271 if (release1)
10272 PyMem_FREE(buf1);
10273 if (release2)
10274 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010275 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (srelease)
10281 PyMem_FREE(sbuf);
10282 if (release1)
10283 PyMem_FREE(buf1);
10284 if (release2)
10285 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010286 return unicode_result_unchanged(self);
10287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 error:
10289 if (srelease && sbuf)
10290 PyMem_FREE(sbuf);
10291 if (release1 && buf1)
10292 PyMem_FREE(buf1);
10293 if (release2 && buf2)
10294 PyMem_FREE(buf2);
10295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296}
10297
10298/* --- Unicode Object Methods --------------------------------------------- */
10299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010300PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010301 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302\n\
10303Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010304characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305
10306static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010307unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010309 if (PyUnicode_READY(self) == -1)
10310 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010311 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312}
10313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010314PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010315 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316\n\
10317Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010318have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319
10320static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010321unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010323 if (PyUnicode_READY(self) == -1)
10324 return NULL;
10325 if (PyUnicode_GET_LENGTH(self) == 0)
10326 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010327 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328}
10329
Benjamin Petersond5890c82012-01-14 13:23:30 -050010330PyDoc_STRVAR(casefold__doc__,
10331 "S.casefold() -> str\n\
10332\n\
10333Return a version of S suitable for caseless comparisons.");
10334
10335static PyObject *
10336unicode_casefold(PyObject *self)
10337{
10338 if (PyUnicode_READY(self) == -1)
10339 return NULL;
10340 if (PyUnicode_IS_ASCII(self))
10341 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010342 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010343}
10344
10345
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010346/* Argument converter. Coerces to a single unicode character */
10347
10348static int
10349convert_uc(PyObject *obj, void *addr)
10350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010352 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010353
Benjamin Peterson14339b62009-01-31 16:36:08 +000010354 uniobj = PyUnicode_FromObject(obj);
10355 if (uniobj == NULL) {
10356 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010357 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010358 return 0;
10359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010361 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010362 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010363 Py_DECREF(uniobj);
10364 return 0;
10365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010367 Py_DECREF(uniobj);
10368 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010369}
10370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010371PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010372 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010374Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010375done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
10377static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010378unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010380 Py_ssize_t marg, left;
10381 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 Py_UCS4 fillchar = ' ';
10383
Victor Stinnere9a29352011-10-01 02:14:59 +020010384 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
Benjamin Petersonbac79492012-01-14 13:34:47 -050010387 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388 return NULL;
10389
Victor Stinnerc4b49542011-12-11 22:44:26 +010010390 if (PyUnicode_GET_LENGTH(self) >= width)
10391 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
Victor Stinnerc4b49542011-12-11 22:44:26 +010010393 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 left = marg / 2 + (marg & width & 1);
10395
Victor Stinner9310abb2011-10-05 00:59:23 +020010396 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397}
10398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399/* This function assumes that str1 and str2 are readied by the caller. */
10400
Marc-André Lemburge5034372000-08-08 08:04:29 +000010401static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010402unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010403{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010404#define COMPARE(TYPE1, TYPE2) \
10405 do { \
10406 TYPE1* p1 = (TYPE1 *)data1; \
10407 TYPE2* p2 = (TYPE2 *)data2; \
10408 TYPE1* end = p1 + len; \
10409 Py_UCS4 c1, c2; \
10410 for (; p1 != end; p1++, p2++) { \
10411 c1 = *p1; \
10412 c2 = *p2; \
10413 if (c1 != c2) \
10414 return (c1 < c2) ? -1 : 1; \
10415 } \
10416 } \
10417 while (0)
10418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 int kind1, kind2;
10420 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010421 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010422
Victor Stinner90db9c42012-10-04 21:53:50 +020010423 /* a string is equal to itself */
10424 if (str1 == str2)
10425 return 0;
10426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 kind1 = PyUnicode_KIND(str1);
10428 kind2 = PyUnicode_KIND(str2);
10429 data1 = PyUnicode_DATA(str1);
10430 data2 = PyUnicode_DATA(str2);
10431 len1 = PyUnicode_GET_LENGTH(str1);
10432 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010433 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010434
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010435 switch(kind1) {
10436 case PyUnicode_1BYTE_KIND:
10437 {
10438 switch(kind2) {
10439 case PyUnicode_1BYTE_KIND:
10440 {
10441 int cmp = memcmp(data1, data2, len);
10442 /* normalize result of memcmp() into the range [-1; 1] */
10443 if (cmp < 0)
10444 return -1;
10445 if (cmp > 0)
10446 return 1;
10447 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010448 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010449 case PyUnicode_2BYTE_KIND:
10450 COMPARE(Py_UCS1, Py_UCS2);
10451 break;
10452 case PyUnicode_4BYTE_KIND:
10453 COMPARE(Py_UCS1, Py_UCS4);
10454 break;
10455 default:
10456 assert(0);
10457 }
10458 break;
10459 }
10460 case PyUnicode_2BYTE_KIND:
10461 {
10462 switch(kind2) {
10463 case PyUnicode_1BYTE_KIND:
10464 COMPARE(Py_UCS2, Py_UCS1);
10465 break;
10466 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010467 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010468 COMPARE(Py_UCS2, Py_UCS2);
10469 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010470 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010471 case PyUnicode_4BYTE_KIND:
10472 COMPARE(Py_UCS2, Py_UCS4);
10473 break;
10474 default:
10475 assert(0);
10476 }
10477 break;
10478 }
10479 case PyUnicode_4BYTE_KIND:
10480 {
10481 switch(kind2) {
10482 case PyUnicode_1BYTE_KIND:
10483 COMPARE(Py_UCS4, Py_UCS1);
10484 break;
10485 case PyUnicode_2BYTE_KIND:
10486 COMPARE(Py_UCS4, Py_UCS2);
10487 break;
10488 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010489 {
10490#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10491 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10492 /* normalize result of wmemcmp() into the range [-1; 1] */
10493 if (cmp < 0)
10494 return -1;
10495 if (cmp > 0)
10496 return 1;
10497#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010498 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010499#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010500 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010501 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010502 default:
10503 assert(0);
10504 }
10505 break;
10506 }
10507 default:
10508 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010509 }
10510
Victor Stinner770e19e2012-10-04 22:59:45 +020010511 if (len1 == len2)
10512 return 0;
10513 if (len1 < len2)
10514 return -1;
10515 else
10516 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010517
10518#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010519}
10520
Victor Stinnere5567ad2012-10-23 02:48:49 +020010521static int
10522unicode_compare_eq(PyObject *str1, PyObject *str2)
10523{
10524 int kind;
10525 void *data1, *data2;
10526 Py_ssize_t len;
10527 int cmp;
10528
10529 /* a string is equal to itself */
10530 if (str1 == str2)
10531 return 1;
10532
10533 len = PyUnicode_GET_LENGTH(str1);
10534 if (PyUnicode_GET_LENGTH(str2) != len)
10535 return 0;
10536 kind = PyUnicode_KIND(str1);
10537 if (PyUnicode_KIND(str2) != kind)
10538 return 0;
10539 data1 = PyUnicode_DATA(str1);
10540 data2 = PyUnicode_DATA(str2);
10541
10542 cmp = memcmp(data1, data2, len * kind);
10543 return (cmp == 0);
10544}
10545
10546
Alexander Belopolsky40018472011-02-26 01:02:56 +000010547int
10548PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10551 if (PyUnicode_READY(left) == -1 ||
10552 PyUnicode_READY(right) == -1)
10553 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010554 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010556 PyErr_Format(PyExc_TypeError,
10557 "Can't compare %.100s and %.100s",
10558 left->ob_type->tp_name,
10559 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560 return -1;
10561}
10562
Martin v. Löwis5b222132007-06-10 09:51:05 +000010563int
10564PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10565{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 Py_ssize_t i;
10567 int kind;
10568 void *data;
10569 Py_UCS4 chr;
10570
Victor Stinner910337b2011-10-03 03:20:16 +020010571 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 if (PyUnicode_READY(uni) == -1)
10573 return -1;
10574 kind = PyUnicode_KIND(uni);
10575 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010576 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10578 if (chr != str[i])
10579 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010580 /* This check keeps Python strings that end in '\0' from comparing equal
10581 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010583 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010584 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010586 return 0;
10587}
10588
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010589
Benjamin Peterson29060642009-01-31 22:14:21 +000010590#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010591 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010592
Alexander Belopolsky40018472011-02-26 01:02:56 +000010593PyObject *
10594PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010595{
10596 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010597 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010598
Victor Stinnere5567ad2012-10-23 02:48:49 +020010599 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10600 Py_RETURN_NOTIMPLEMENTED;
10601
10602 if (PyUnicode_READY(left) == -1 ||
10603 PyUnicode_READY(right) == -1)
10604 return NULL;
10605
10606 if (op == Py_EQ || op == Py_NE) {
10607 result = unicode_compare_eq(left, right);
10608 if (op == Py_EQ)
10609 v = TEST_COND(result);
10610 else
10611 v = TEST_COND(!result);
10612 }
10613 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010614 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010615
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010616 /* Convert the return value to a Boolean */
10617 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010618 case Py_LE:
10619 v = TEST_COND(result <= 0);
10620 break;
10621 case Py_GE:
10622 v = TEST_COND(result >= 0);
10623 break;
10624 case Py_LT:
10625 v = TEST_COND(result == -1);
10626 break;
10627 case Py_GT:
10628 v = TEST_COND(result == 1);
10629 break;
10630 default:
10631 PyErr_BadArgument();
10632 return NULL;
10633 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010634 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010635 Py_INCREF(v);
10636 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010637}
10638
Alexander Belopolsky40018472011-02-26 01:02:56 +000010639int
10640PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010641{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010643 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 void *buf1, *buf2;
10645 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010646 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010647
10648 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 sub = PyUnicode_FromObject(element);
10650 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010651 PyErr_Format(PyExc_TypeError,
10652 "'in <string>' requires string as left operand, not %s",
10653 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010655 }
10656
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010658 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 Py_DECREF(sub);
10660 return -1;
10661 }
10662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 kind1 = PyUnicode_KIND(str);
10664 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 buf1 = PyUnicode_DATA(str);
10666 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010667 if (kind2 != kind1) {
10668 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010669 Py_DECREF(sub);
10670 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010671 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010672 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010673 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 if (!buf2) {
10676 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010677 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 return -1;
10679 }
10680 len1 = PyUnicode_GET_LENGTH(str);
10681 len2 = PyUnicode_GET_LENGTH(sub);
10682
Victor Stinner77282cb2013-04-14 19:22:47 +020010683 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 case PyUnicode_1BYTE_KIND:
10685 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10686 break;
10687 case PyUnicode_2BYTE_KIND:
10688 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10689 break;
10690 case PyUnicode_4BYTE_KIND:
10691 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10692 break;
10693 default:
10694 result = -1;
10695 assert(0);
10696 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697
10698 Py_DECREF(str);
10699 Py_DECREF(sub);
10700
Victor Stinner77282cb2013-04-14 19:22:47 +020010701 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 PyMem_Free(buf2);
10703
Guido van Rossum403d68b2000-03-13 15:55:09 +000010704 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010705}
10706
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707/* Concat to string or Unicode object giving a new Unicode object. */
10708
Alexander Belopolsky40018472011-02-26 01:02:56 +000010709PyObject *
10710PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010713 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010714 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715
10716 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010719 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723
10724 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010725 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010729 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732 }
10733
Victor Stinner488fa492011-12-12 00:01:39 +010010734 u_len = PyUnicode_GET_LENGTH(u);
10735 v_len = PyUnicode_GET_LENGTH(v);
10736 if (u_len > PY_SSIZE_T_MAX - v_len) {
10737 PyErr_SetString(PyExc_OverflowError,
10738 "strings are too large to concat");
10739 goto onError;
10740 }
10741 new_len = u_len + v_len;
10742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010744 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010745 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010748 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010751 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10752 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753 Py_DECREF(u);
10754 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010755 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757
Benjamin Peterson29060642009-01-31 22:14:21 +000010758 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 Py_XDECREF(u);
10760 Py_XDECREF(v);
10761 return NULL;
10762}
10763
Walter Dörwald1ab83302007-05-18 17:15:44 +000010764void
Victor Stinner23e56682011-10-03 03:54:37 +020010765PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010766{
Victor Stinner23e56682011-10-03 03:54:37 +020010767 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010768 Py_UCS4 maxchar, maxchar2;
10769 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010770
10771 if (p_left == NULL) {
10772 if (!PyErr_Occurred())
10773 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010774 return;
10775 }
Victor Stinner23e56682011-10-03 03:54:37 +020010776 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020010777 if (right == NULL || left == NULL
10778 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010779 if (!PyErr_Occurred())
10780 PyErr_BadInternalCall();
10781 goto error;
10782 }
10783
Benjamin Petersonbac79492012-01-14 13:34:47 -050010784 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010785 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010786 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010787 goto error;
10788
Victor Stinner488fa492011-12-12 00:01:39 +010010789 /* Shortcuts */
10790 if (left == unicode_empty) {
10791 Py_DECREF(left);
10792 Py_INCREF(right);
10793 *p_left = right;
10794 return;
10795 }
10796 if (right == unicode_empty)
10797 return;
10798
10799 left_len = PyUnicode_GET_LENGTH(left);
10800 right_len = PyUnicode_GET_LENGTH(right);
10801 if (left_len > PY_SSIZE_T_MAX - right_len) {
10802 PyErr_SetString(PyExc_OverflowError,
10803 "strings are too large to concat");
10804 goto error;
10805 }
10806 new_len = left_len + right_len;
10807
10808 if (unicode_modifiable(left)
10809 && PyUnicode_CheckExact(right)
10810 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010811 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10812 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010813 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010814 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010815 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10816 {
10817 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010818 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010010819 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020010820
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010821 /* copy 'right' into the newly allocated area of 'left' */
10822 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010823 }
Victor Stinner488fa492011-12-12 00:01:39 +010010824 else {
10825 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10826 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010827 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010828
Victor Stinner488fa492011-12-12 00:01:39 +010010829 /* Concat the two Unicode strings */
10830 res = PyUnicode_New(new_len, maxchar);
10831 if (res == NULL)
10832 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010833 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10834 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010835 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010836 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010010837 }
10838 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010839 return;
10840
10841error:
Victor Stinner488fa492011-12-12 00:01:39 +010010842 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010843}
10844
10845void
10846PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10847{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010848 PyUnicode_Append(pleft, right);
10849 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010850}
10851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010852PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010853 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010855Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010856string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010857interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858
10859static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010860unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010862 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010863 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010864 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 int kind1, kind2, kind;
10867 void *buf1, *buf2;
10868 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869
Jesus Ceaac451502011-04-20 17:09:23 +020010870 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10871 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010872 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 kind1 = PyUnicode_KIND(self);
10875 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020010876 if (kind2 > kind1) {
10877 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010878 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020010879 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010880 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 buf1 = PyUnicode_DATA(self);
10882 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010884 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010885 if (!buf2) {
10886 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 return NULL;
10888 }
10889 len1 = PyUnicode_GET_LENGTH(self);
10890 len2 = PyUnicode_GET_LENGTH(substring);
10891
10892 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010893 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 case PyUnicode_1BYTE_KIND:
10895 iresult = ucs1lib_count(
10896 ((Py_UCS1*)buf1) + start, end - start,
10897 buf2, len2, PY_SSIZE_T_MAX
10898 );
10899 break;
10900 case PyUnicode_2BYTE_KIND:
10901 iresult = ucs2lib_count(
10902 ((Py_UCS2*)buf1) + start, end - start,
10903 buf2, len2, PY_SSIZE_T_MAX
10904 );
10905 break;
10906 case PyUnicode_4BYTE_KIND:
10907 iresult = ucs4lib_count(
10908 ((Py_UCS4*)buf1) + start, end - start,
10909 buf2, len2, PY_SSIZE_T_MAX
10910 );
10911 break;
10912 default:
10913 assert(0); iresult = 0;
10914 }
10915
10916 result = PyLong_FromSsize_t(iresult);
10917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 if (kind2 != kind)
10919 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920
10921 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923 return result;
10924}
10925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010926PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010927 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010929Encode S using the codec registered for encoding. Default encoding\n\
10930is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010931handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010932a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10933'xmlcharrefreplace' as well as any other name registered with\n\
10934codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935
10936static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010937unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010939 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940 char *encoding = NULL;
10941 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010942
Benjamin Peterson308d6372009-09-18 21:42:35 +000010943 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10944 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010946 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010947}
10948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010949PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010950 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951\n\
10952Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010953If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
10955static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010956unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010958 Py_ssize_t i, j, line_pos, src_len, incr;
10959 Py_UCS4 ch;
10960 PyObject *u;
10961 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010963 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010964 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965
10966 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
Antoine Pitrou22425222011-10-04 19:10:51 +020010969 if (PyUnicode_READY(self) == -1)
10970 return NULL;
10971
Thomas Wouters7e474022000-07-16 12:04:32 +000010972 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010973 src_len = PyUnicode_GET_LENGTH(self);
10974 i = j = line_pos = 0;
10975 kind = PyUnicode_KIND(self);
10976 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010977 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010978 for (; i < src_len; i++) {
10979 ch = PyUnicode_READ(kind, src_data, i);
10980 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010981 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010982 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010983 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010984 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010985 goto overflow;
10986 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010988 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010992 goto overflow;
10993 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010995 if (ch == '\n' || ch == '\r')
10996 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010998 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010999 if (!found)
11000 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011001
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011003 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 if (!u)
11005 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011006 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
Antoine Pitroue71d5742011-10-04 15:55:09 +020011008 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009
Antoine Pitroue71d5742011-10-04 15:55:09 +020011010 for (; i < src_len; i++) {
11011 ch = PyUnicode_READ(kind, src_data, i);
11012 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011013 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011014 incr = tabsize - (line_pos % tabsize);
11015 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011016 FILL(kind, dest_data, ' ', j, incr);
11017 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011019 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011021 line_pos++;
11022 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011023 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011024 if (ch == '\n' || ch == '\r')
11025 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011027 }
11028 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011029 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011030
Antoine Pitroue71d5742011-10-04 15:55:09 +020011031 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011032 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034}
11035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011036PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011037 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038\n\
11039Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011040such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041arguments start and end are interpreted as in slice notation.\n\
11042\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011043Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044
11045static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011048 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011049 Py_ssize_t start;
11050 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011051 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052
Jesus Ceaac451502011-04-20 17:09:23 +020011053 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11054 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056
Christian Heimesd47802e2013-06-29 21:33:36 +020011057 if (PyUnicode_READY(self) == -1) {
11058 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011060 }
11061 if (PyUnicode_READY(substring) == -1) {
11062 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065
Victor Stinner7931d9a2011-11-04 00:22:48 +010011066 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
11068 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 if (result == -2)
11071 return NULL;
11072
Christian Heimes217cfd12007-12-02 14:31:20 +000011073 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074}
11075
11076static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011077unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011079 void *data;
11080 enum PyUnicode_Kind kind;
11081 Py_UCS4 ch;
11082 PyObject *res;
11083
11084 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11085 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011087 }
11088 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11089 PyErr_SetString(PyExc_IndexError, "string index out of range");
11090 return NULL;
11091 }
11092 kind = PyUnicode_KIND(self);
11093 data = PyUnicode_DATA(self);
11094 ch = PyUnicode_READ(kind, data, index);
11095 if (ch < 256)
11096 return get_latin1_char(ch);
11097
11098 res = PyUnicode_New(1, ch);
11099 if (res == NULL)
11100 return NULL;
11101 kind = PyUnicode_KIND(res);
11102 data = PyUnicode_DATA(res);
11103 PyUnicode_WRITE(kind, data, 0, ch);
11104 assert(_PyUnicode_CheckConsistency(res, 1));
11105 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106}
11107
Guido van Rossumc2504932007-09-18 19:42:40 +000011108/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011109 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011110static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011111unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112{
Guido van Rossumc2504932007-09-18 19:42:40 +000011113 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011114 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011115
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011116#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011117 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011118#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 if (_PyUnicode_HASH(self) != -1)
11120 return _PyUnicode_HASH(self);
11121 if (PyUnicode_READY(self) == -1)
11122 return -1;
11123 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011124 /*
11125 We make the hash of the empty string be 0, rather than using
11126 (prefix ^ suffix), since this slightly obfuscates the hash secret
11127 */
11128 if (len == 0) {
11129 _PyUnicode_HASH(self) = 0;
11130 return 0;
11131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132
11133 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011134#define HASH(P) \
11135 x ^= (Py_uhash_t) *P << 7; \
11136 while (--len >= 0) \
11137 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138
Georg Brandl2fb477c2012-02-21 00:33:36 +010011139 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 switch (PyUnicode_KIND(self)) {
11141 case PyUnicode_1BYTE_KIND: {
11142 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11143 HASH(c);
11144 break;
11145 }
11146 case PyUnicode_2BYTE_KIND: {
11147 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11148 HASH(s);
11149 break;
11150 }
11151 default: {
11152 Py_UCS4 *l;
11153 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11154 "Impossible switch case in unicode_hash");
11155 l = PyUnicode_4BYTE_DATA(self);
11156 HASH(l);
11157 break;
11158 }
11159 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011160 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11161 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162
Guido van Rossumc2504932007-09-18 19:42:40 +000011163 if (x == -1)
11164 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011166 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011170PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011171 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011173Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174
11175static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011178 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011179 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011180 Py_ssize_t start;
11181 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182
Jesus Ceaac451502011-04-20 17:09:23 +020011183 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11184 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186
Christian Heimesd47a0452013-06-29 21:21:37 +020011187 if (PyUnicode_READY(self) == -1) {
11188 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011190 }
11191 if (PyUnicode_READY(substring) == -1) {
11192 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195
Victor Stinner7931d9a2011-11-04 00:22:48 +010011196 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
11198 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 if (result == -2)
11201 return NULL;
11202
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203 if (result < 0) {
11204 PyErr_SetString(PyExc_ValueError, "substring not found");
11205 return NULL;
11206 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011207
Christian Heimes217cfd12007-12-02 14:31:20 +000011208 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209}
11210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011211PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011214Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011215at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216
11217static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011218unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 Py_ssize_t i, length;
11221 int kind;
11222 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 int cased;
11224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 if (PyUnicode_READY(self) == -1)
11226 return NULL;
11227 length = PyUnicode_GET_LENGTH(self);
11228 kind = PyUnicode_KIND(self);
11229 data = PyUnicode_DATA(self);
11230
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 if (length == 1)
11233 return PyBool_FromLong(
11234 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011236 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011239
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 for (i = 0; i < length; i++) {
11242 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011243
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11245 return PyBool_FromLong(0);
11246 else if (!cased && Py_UNICODE_ISLOWER(ch))
11247 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011249 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250}
11251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011252PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011255Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011256at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
11258static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011259unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 Py_ssize_t i, length;
11262 int kind;
11263 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264 int cased;
11265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 if (PyUnicode_READY(self) == -1)
11267 return NULL;
11268 length = PyUnicode_GET_LENGTH(self);
11269 kind = PyUnicode_KIND(self);
11270 data = PyUnicode_DATA(self);
11271
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 if (length == 1)
11274 return PyBool_FromLong(
11275 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011277 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011280
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 for (i = 0; i < length; i++) {
11283 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011284
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11286 return PyBool_FromLong(0);
11287 else if (!cased && Py_UNICODE_ISUPPER(ch))
11288 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011290 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291}
11292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011293PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011296Return True if S is a titlecased string and there is at least one\n\
11297character in S, i.e. upper- and titlecase characters may only\n\
11298follow uncased characters and lowercase characters only cased ones.\n\
11299Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300
11301static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011302unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 Py_ssize_t i, length;
11305 int kind;
11306 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307 int cased, previous_is_cased;
11308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 if (PyUnicode_READY(self) == -1)
11310 return NULL;
11311 length = PyUnicode_GET_LENGTH(self);
11312 kind = PyUnicode_KIND(self);
11313 data = PyUnicode_DATA(self);
11314
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 if (length == 1) {
11317 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11318 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11319 (Py_UNICODE_ISUPPER(ch) != 0));
11320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011322 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011324 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011325
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326 cased = 0;
11327 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 for (i = 0; i < length; i++) {
11329 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011330
Benjamin Peterson29060642009-01-31 22:14:21 +000011331 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11332 if (previous_is_cased)
11333 return PyBool_FromLong(0);
11334 previous_is_cased = 1;
11335 cased = 1;
11336 }
11337 else if (Py_UNICODE_ISLOWER(ch)) {
11338 if (!previous_is_cased)
11339 return PyBool_FromLong(0);
11340 previous_is_cased = 1;
11341 cased = 1;
11342 }
11343 else
11344 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011346 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347}
11348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011352Return True if all characters in S are whitespace\n\
11353and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
11355static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011356unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 Py_ssize_t i, length;
11359 int kind;
11360 void *data;
11361
11362 if (PyUnicode_READY(self) == -1)
11363 return NULL;
11364 length = PyUnicode_GET_LENGTH(self);
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 1)
11370 return PyBool_FromLong(
11371 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011373 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 for (i = 0; i < length; i++) {
11378 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011379 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011382 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383}
11384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011385PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011387\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011388Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011389and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011390
11391static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011392unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 Py_ssize_t i, length;
11395 int kind;
11396 void *data;
11397
11398 if (PyUnicode_READY(self) == -1)
11399 return NULL;
11400 length = PyUnicode_GET_LENGTH(self);
11401 kind = PyUnicode_KIND(self);
11402 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011403
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011404 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 if (length == 1)
11406 return PyBool_FromLong(
11407 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011408
11409 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 for (i = 0; i < length; i++) {
11414 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011416 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011417 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011418}
11419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011422\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011423Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011424and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011425
11426static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011427unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 int kind;
11430 void *data;
11431 Py_ssize_t len, i;
11432
11433 if (PyUnicode_READY(self) == -1)
11434 return NULL;
11435
11436 kind = PyUnicode_KIND(self);
11437 data = PyUnicode_DATA(self);
11438 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011439
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011440 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 if (len == 1) {
11442 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11443 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11444 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011445
11446 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 for (i = 0; i < len; i++) {
11451 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011452 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011454 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011455 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011456}
11457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011458PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011461Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011462False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
11464static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011465unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 Py_ssize_t i, length;
11468 int kind;
11469 void *data;
11470
11471 if (PyUnicode_READY(self) == -1)
11472 return NULL;
11473 length = PyUnicode_GET_LENGTH(self);
11474 kind = PyUnicode_KIND(self);
11475 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 if (length == 1)
11479 return PyBool_FromLong(
11480 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011482 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 for (i = 0; i < length; i++) {
11487 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011490 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491}
11492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011493PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011496Return True if all characters in S are digits\n\
11497and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
11499static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011500unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 Py_ssize_t i, length;
11503 int kind;
11504 void *data;
11505
11506 if (PyUnicode_READY(self) == -1)
11507 return NULL;
11508 length = PyUnicode_GET_LENGTH(self);
11509 kind = PyUnicode_KIND(self);
11510 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if (length == 1) {
11514 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11515 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011518 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 for (i = 0; i < length; i++) {
11523 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011526 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527}
11528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011529PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011532Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011533False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
11535static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011536unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 Py_ssize_t i, length;
11539 int kind;
11540 void *data;
11541
11542 if (PyUnicode_READY(self) == -1)
11543 return NULL;
11544 length = PyUnicode_GET_LENGTH(self);
11545 kind = PyUnicode_KIND(self);
11546 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 if (length == 1)
11550 return PyBool_FromLong(
11551 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011553 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 for (i = 0; i < length; i++) {
11558 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011561 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562}
11563
Martin v. Löwis47383402007-08-15 07:32:56 +000011564int
11565PyUnicode_IsIdentifier(PyObject *self)
11566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 int kind;
11568 void *data;
11569 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011570 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 if (PyUnicode_READY(self) == -1) {
11573 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 }
11576
11577 /* Special case for empty strings */
11578 if (PyUnicode_GET_LENGTH(self) == 0)
11579 return 0;
11580 kind = PyUnicode_KIND(self);
11581 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011582
11583 /* PEP 3131 says that the first character must be in
11584 XID_Start and subsequent characters in XID_Continue,
11585 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011586 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011587 letters, digits, underscore). However, given the current
11588 definition of XID_Start and XID_Continue, it is sufficient
11589 to check just for these, except that _ must be allowed
11590 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011592 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011593 return 0;
11594
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011595 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011598 return 1;
11599}
11600
11601PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011603\n\
11604Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011605to the language definition.\n\
11606\n\
11607Use keyword.iskeyword() to test for reserved identifiers\n\
11608such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011609
11610static PyObject*
11611unicode_isidentifier(PyObject *self)
11612{
11613 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11614}
11615
Georg Brandl559e5d72008-06-11 18:37:52 +000011616PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011618\n\
11619Return True if all characters in S are considered\n\
11620printable in repr() or S is empty, False otherwise.");
11621
11622static PyObject*
11623unicode_isprintable(PyObject *self)
11624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 Py_ssize_t i, length;
11626 int kind;
11627 void *data;
11628
11629 if (PyUnicode_READY(self) == -1)
11630 return NULL;
11631 length = PyUnicode_GET_LENGTH(self);
11632 kind = PyUnicode_KIND(self);
11633 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011634
11635 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 if (length == 1)
11637 return PyBool_FromLong(
11638 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 for (i = 0; i < length; i++) {
11641 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011642 Py_RETURN_FALSE;
11643 }
11644 }
11645 Py_RETURN_TRUE;
11646}
11647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011648PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011649 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650\n\
11651Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011652iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653
11654static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011655unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011657 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658}
11659
Martin v. Löwis18e16552006-02-15 17:27:45 +000011660static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011661unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 if (PyUnicode_READY(self) == -1)
11664 return -1;
11665 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666}
11667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011668PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011669 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011671Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011672done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673
11674static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011675unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011677 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 Py_UCS4 fillchar = ' ';
11679
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011680 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681 return NULL;
11682
Benjamin Petersonbac79492012-01-14 13:34:47 -050011683 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685
Victor Stinnerc4b49542011-12-11 22:44:26 +010011686 if (PyUnicode_GET_LENGTH(self) >= width)
11687 return unicode_result_unchanged(self);
11688
11689 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690}
11691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011692PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011695Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696
11697static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011698unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011700 if (PyUnicode_READY(self) == -1)
11701 return NULL;
11702 if (PyUnicode_IS_ASCII(self))
11703 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011704 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705}
11706
/* Selectors for the strip family.  The values double as indices into
   stripformat[] below, so the two must stay in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for selector i: the matching format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11715
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716/* externally visible for str.strip(unicode) */
11717PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011718_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 void *data;
11721 int kind;
11722 Py_ssize_t i, j, len;
11723 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011724 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011726 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11727 return NULL;
11728
11729 kind = PyUnicode_KIND(self);
11730 data = PyUnicode_DATA(self);
11731 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011732 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11734 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011735 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011736
Benjamin Peterson14339b62009-01-31 16:36:08 +000011737 i = 0;
11738 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011739 while (i < len) {
11740 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11741 if (!BLOOM(sepmask, ch))
11742 break;
11743 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11744 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 i++;
11746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011747 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011748
Benjamin Peterson14339b62009-01-31 16:36:08 +000011749 j = len;
11750 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011751 j--;
11752 while (j >= i) {
11753 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11754 if (!BLOOM(sepmask, ch))
11755 break;
11756 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11757 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011759 }
11760
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011762 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011763
Victor Stinner7931d9a2011-11-04 00:22:48 +010011764 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765}
11766
11767PyObject*
11768PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11769{
11770 unsigned char *data;
11771 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011772 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773
Victor Stinnerde636f32011-10-01 03:55:54 +020011774 if (PyUnicode_READY(self) == -1)
11775 return NULL;
11776
Victor Stinner684d5fd2012-05-03 02:32:34 +020011777 length = PyUnicode_GET_LENGTH(self);
11778 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011779
Victor Stinner684d5fd2012-05-03 02:32:34 +020011780 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011781 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782
Victor Stinnerde636f32011-10-01 03:55:54 +020011783 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011784 PyErr_SetString(PyExc_IndexError, "string index out of range");
11785 return NULL;
11786 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011787 if (start >= length || end < start)
11788 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011789
Victor Stinner684d5fd2012-05-03 02:32:34 +020011790 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011791 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011792 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011793 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011794 }
11795 else {
11796 kind = PyUnicode_KIND(self);
11797 data = PyUnicode_1BYTE_DATA(self);
11798 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011799 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011800 length);
11801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
11804static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011805do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 Py_ssize_t len, i, j;
11808
11809 if (PyUnicode_READY(self) == -1)
11810 return NULL;
11811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011813
Victor Stinnercc7af722013-04-09 22:39:24 +020011814 if (PyUnicode_IS_ASCII(self)) {
11815 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11816
11817 i = 0;
11818 if (striptype != RIGHTSTRIP) {
11819 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011820 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020011821 if (!_Py_ascii_whitespace[ch])
11822 break;
11823 i++;
11824 }
11825 }
11826
11827 j = len;
11828 if (striptype != LEFTSTRIP) {
11829 j--;
11830 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011831 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020011832 if (!_Py_ascii_whitespace[ch])
11833 break;
11834 j--;
11835 }
11836 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011837 }
11838 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011839 else {
11840 int kind = PyUnicode_KIND(self);
11841 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011842
Victor Stinnercc7af722013-04-09 22:39:24 +020011843 i = 0;
11844 if (striptype != RIGHTSTRIP) {
11845 while (i < len) {
11846 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11847 if (!Py_UNICODE_ISSPACE(ch))
11848 break;
11849 i++;
11850 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011851 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011852
11853 j = len;
11854 if (striptype != LEFTSTRIP) {
11855 j--;
11856 while (j >= i) {
11857 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11858 if (!Py_UNICODE_ISSPACE(ch))
11859 break;
11860 j--;
11861 }
11862 j++;
11863 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011864 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011865
Victor Stinner7931d9a2011-11-04 00:22:48 +010011866 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867}
11868
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011869
11870static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011871do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011872{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011873 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011874
Benjamin Peterson14339b62009-01-31 16:36:08 +000011875 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11876 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011877
Benjamin Peterson14339b62009-01-31 16:36:08 +000011878 if (sep != NULL && sep != Py_None) {
11879 if (PyUnicode_Check(sep))
11880 return _PyUnicode_XStrip(self, striptype, sep);
11881 else {
11882 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 "%s arg must be None or str",
11884 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011885 return NULL;
11886 }
11887 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011888
Benjamin Peterson14339b62009-01-31 16:36:08 +000011889 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011890}
11891
11892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011893PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011894 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011895\n\
11896Return a copy of the string S with leading and trailing\n\
11897whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011898If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011899
11900static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011901unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011902{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011903 if (PyTuple_GET_SIZE(args) == 0)
11904 return do_strip(self, BOTHSTRIP); /* Common case */
11905 else
11906 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011907}
11908
11909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011910PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011912\n\
11913Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011914If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011915
11916static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011917unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011918{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011919 if (PyTuple_GET_SIZE(args) == 0)
11920 return do_strip(self, LEFTSTRIP); /* Common case */
11921 else
11922 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011923}
11924
11925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011926PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011927 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011928\n\
11929Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011930If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011931
11932static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011933unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011934{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011935 if (PyTuple_GET_SIZE(args) == 0)
11936 return do_strip(self, RIGHTSTRIP); /* Common case */
11937 else
11938 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011939}
11940
11941
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011943unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011945 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
Serhiy Storchaka05997252013-01-26 12:14:02 +020011948 if (len < 1)
11949 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950
Victor Stinnerc4b49542011-12-11 22:44:26 +010011951 /* no repeat, return original string */
11952 if (len == 1)
11953 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011954
Benjamin Petersonbac79492012-01-14 13:34:47 -050011955 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 return NULL;
11957
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011958 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011959 PyErr_SetString(PyExc_OverflowError,
11960 "repeated string is too long");
11961 return NULL;
11962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011964
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011965 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 if (!u)
11967 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011968 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_GET_LENGTH(str) == 1) {
11971 const int kind = PyUnicode_KIND(str);
11972 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011973 if (kind == PyUnicode_1BYTE_KIND) {
11974 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011975 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011976 }
11977 else if (kind == PyUnicode_2BYTE_KIND) {
11978 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011979 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011980 ucs2[n] = fill_char;
11981 } else {
11982 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11983 assert(kind == PyUnicode_4BYTE_KIND);
11984 for (n = 0; n < len; ++n)
11985 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 }
11988 else {
11989 /* number of characters copied this far */
11990 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011991 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 char *to = (char *) PyUnicode_DATA(u);
11993 Py_MEMCPY(to, PyUnicode_DATA(str),
11994 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 n = (done <= nchars-done) ? done : nchars-done;
11997 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011998 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000 }
12001
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012002 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012003 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004}
12005
Alexander Belopolsky40018472011-02-26 01:02:56 +000012006PyObject *
12007PyUnicode_Replace(PyObject *obj,
12008 PyObject *subobj,
12009 PyObject *replobj,
12010 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011{
12012 PyObject *self;
12013 PyObject *str1;
12014 PyObject *str2;
12015 PyObject *result;
12016
12017 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012018 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012021 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 Py_DECREF(self);
12023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 }
12025 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012026 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012027 Py_DECREF(self);
12028 Py_DECREF(str1);
12029 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012031 if (PyUnicode_READY(self) == -1 ||
12032 PyUnicode_READY(str1) == -1 ||
12033 PyUnicode_READY(str2) == -1)
12034 result = NULL;
12035 else
12036 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037 Py_DECREF(self);
12038 Py_DECREF(str1);
12039 Py_DECREF(str2);
12040 return result;
12041}
12042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012043PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012044 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045\n\
12046Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012047old replaced by new. If the optional argument count is\n\
12048given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
12050static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 PyObject *str1;
12054 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012055 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056 PyObject *result;
12057
Martin v. Löwis18e16552006-02-15 17:27:45 +000012058 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012060 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012063 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 return NULL;
12065 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012066 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012067 Py_DECREF(str1);
12068 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012069 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012070 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12071 result = NULL;
12072 else
12073 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074
12075 Py_DECREF(str1);
12076 Py_DECREF(str2);
12077 return result;
12078}
12079
Alexander Belopolsky40018472011-02-26 01:02:56 +000012080static PyObject *
12081unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012083 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 Py_ssize_t isize;
12085 Py_ssize_t osize, squote, dquote, i, o;
12086 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012087 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012091 return NULL;
12092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 isize = PyUnicode_GET_LENGTH(unicode);
12094 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 /* Compute length of output, quote characters, and
12097 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012098 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 max = 127;
12100 squote = dquote = 0;
12101 ikind = PyUnicode_KIND(unicode);
12102 for (i = 0; i < isize; i++) {
12103 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12104 switch (ch) {
12105 case '\'': squote++; osize++; break;
12106 case '"': dquote++; osize++; break;
12107 case '\\': case '\t': case '\r': case '\n':
12108 osize += 2; break;
12109 default:
12110 /* Fast-path ASCII */
12111 if (ch < ' ' || ch == 0x7f)
12112 osize += 4; /* \xHH */
12113 else if (ch < 0x7f)
12114 osize++;
12115 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12116 osize++;
12117 max = ch > max ? ch : max;
12118 }
12119 else if (ch < 0x100)
12120 osize += 4; /* \xHH */
12121 else if (ch < 0x10000)
12122 osize += 6; /* \uHHHH */
12123 else
12124 osize += 10; /* \uHHHHHHHH */
12125 }
12126 }
12127
12128 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012129 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012131 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 if (dquote)
12133 /* Both squote and dquote present. Use squote,
12134 and escape them */
12135 osize += squote;
12136 else
12137 quote = '"';
12138 }
Victor Stinner55c08782013-04-14 18:45:39 +020012139 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140
12141 repr = PyUnicode_New(osize, max);
12142 if (repr == NULL)
12143 return NULL;
12144 okind = PyUnicode_KIND(repr);
12145 odata = PyUnicode_DATA(repr);
12146
12147 PyUnicode_WRITE(okind, odata, 0, quote);
12148 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012149 if (unchanged) {
12150 _PyUnicode_FastCopyCharacters(repr, 1,
12151 unicode, 0,
12152 isize);
12153 }
12154 else {
12155 for (i = 0, o = 1; i < isize; i++) {
12156 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157
Victor Stinner55c08782013-04-14 18:45:39 +020012158 /* Escape quotes and backslashes */
12159 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012160 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012162 continue;
12163 }
12164
12165 /* Map special whitespace to '\t', \n', '\r' */
12166 if (ch == '\t') {
12167 PyUnicode_WRITE(okind, odata, o++, '\\');
12168 PyUnicode_WRITE(okind, odata, o++, 't');
12169 }
12170 else if (ch == '\n') {
12171 PyUnicode_WRITE(okind, odata, o++, '\\');
12172 PyUnicode_WRITE(okind, odata, o++, 'n');
12173 }
12174 else if (ch == '\r') {
12175 PyUnicode_WRITE(okind, odata, o++, '\\');
12176 PyUnicode_WRITE(okind, odata, o++, 'r');
12177 }
12178
12179 /* Map non-printable US ASCII to '\xhh' */
12180 else if (ch < ' ' || ch == 0x7F) {
12181 PyUnicode_WRITE(okind, odata, o++, '\\');
12182 PyUnicode_WRITE(okind, odata, o++, 'x');
12183 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12184 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12185 }
12186
12187 /* Copy ASCII characters as-is */
12188 else if (ch < 0x7F) {
12189 PyUnicode_WRITE(okind, odata, o++, ch);
12190 }
12191
12192 /* Non-ASCII characters */
12193 else {
12194 /* Map Unicode whitespace and control characters
12195 (categories Z* and C* except ASCII space)
12196 */
12197 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12198 PyUnicode_WRITE(okind, odata, o++, '\\');
12199 /* Map 8-bit characters to '\xhh' */
12200 if (ch <= 0xff) {
12201 PyUnicode_WRITE(okind, odata, o++, 'x');
12202 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12203 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12204 }
12205 /* Map 16-bit characters to '\uxxxx' */
12206 else if (ch <= 0xffff) {
12207 PyUnicode_WRITE(okind, odata, o++, 'u');
12208 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12209 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12210 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12211 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12212 }
12213 /* Map 21-bit characters to '\U00xxxxxx' */
12214 else {
12215 PyUnicode_WRITE(okind, odata, o++, 'U');
12216 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12217 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12218 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12219 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12220 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12221 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12222 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12223 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12224 }
12225 }
12226 /* Copy characters as-is */
12227 else {
12228 PyUnicode_WRITE(okind, odata, o++, ch);
12229 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012230 }
12231 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012234 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012235 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236}
12237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012238PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240\n\
12241Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012242such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243arguments start and end are interpreted as in slice notation.\n\
12244\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012245Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246
12247static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012250 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012251 Py_ssize_t start;
12252 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012253 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254
Jesus Ceaac451502011-04-20 17:09:23 +020012255 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12256 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258
Christian Heimesea71a522013-06-29 21:17:34 +020012259 if (PyUnicode_READY(self) == -1) {
12260 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012262 }
12263 if (PyUnicode_READY(substring) == -1) {
12264 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267
Victor Stinner7931d9a2011-11-04 00:22:48 +010012268 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269
12270 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 if (result == -2)
12273 return NULL;
12274
Christian Heimes217cfd12007-12-02 14:31:20 +000012275 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276}
12277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012278PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012281Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
12283static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012286 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012287 Py_ssize_t start;
12288 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012289 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290
Jesus Ceaac451502011-04-20 17:09:23 +020012291 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12292 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294
Christian Heimesea71a522013-06-29 21:17:34 +020012295 if (PyUnicode_READY(self) == -1) {
12296 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012298 }
12299 if (PyUnicode_READY(substring) == -1) {
12300 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303
Victor Stinner7931d9a2011-11-04 00:22:48 +010012304 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305
12306 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 if (result == -2)
12309 return NULL;
12310
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311 if (result < 0) {
12312 PyErr_SetString(PyExc_ValueError, "substring not found");
12313 return NULL;
12314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315
Christian Heimes217cfd12007-12-02 14:31:20 +000012316 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317}
12318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012319PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012322Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012323done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324
12325static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012326unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012328 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 Py_UCS4 fillchar = ' ';
12330
Victor Stinnere9a29352011-10-01 02:14:59 +020012331 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012333
Benjamin Petersonbac79492012-01-14 13:34:47 -050012334 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335 return NULL;
12336
Victor Stinnerc4b49542011-12-11 22:44:26 +010012337 if (PyUnicode_GET_LENGTH(self) >= width)
12338 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339
Victor Stinnerc4b49542011-12-11 22:44:26 +010012340 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341}
12342
Alexander Belopolsky40018472011-02-26 01:02:56 +000012343PyObject *
12344PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345{
12346 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012347
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348 s = PyUnicode_FromObject(s);
12349 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012350 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 if (sep != NULL) {
12352 sep = PyUnicode_FromObject(sep);
12353 if (sep == NULL) {
12354 Py_DECREF(s);
12355 return NULL;
12356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357 }
12358
Victor Stinner9310abb2011-10-05 00:59:23 +020012359 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360
12361 Py_DECREF(s);
12362 Py_XDECREF(sep);
12363 return result;
12364}
12365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012366PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012367 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368\n\
12369Return a list of the words in S, using sep as the\n\
12370delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012371splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012372whitespace string is a separator and empty strings are\n\
12373removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374
12375static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012376unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012378 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012380 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012382 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12383 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 return NULL;
12385
12386 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012389 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012391 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392}
12393
Thomas Wouters477c8d52006-05-27 19:21:47 +000012394PyObject *
12395PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12396{
12397 PyObject* str_obj;
12398 PyObject* sep_obj;
12399 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 int kind1, kind2, kind;
12401 void *buf1 = NULL, *buf2 = NULL;
12402 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012403
12404 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012405 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012406 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012407 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012408 if (!sep_obj) {
12409 Py_DECREF(str_obj);
12410 return NULL;
12411 }
12412 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12413 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012414 Py_DECREF(str_obj);
12415 return NULL;
12416 }
12417
Victor Stinner14f8f022011-10-05 20:58:25 +020012418 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012420 kind = Py_MAX(kind1, kind2);
12421 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012423 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 if (!buf1)
12425 goto onError;
12426 buf2 = PyUnicode_DATA(sep_obj);
12427 if (kind2 != kind)
12428 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12429 if (!buf2)
12430 goto onError;
12431 len1 = PyUnicode_GET_LENGTH(str_obj);
12432 len2 = PyUnicode_GET_LENGTH(sep_obj);
12433
Benjamin Petersonead6b532011-12-20 17:23:42 -060012434 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012436 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12437 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12438 else
12439 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 break;
12441 case PyUnicode_2BYTE_KIND:
12442 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12443 break;
12444 case PyUnicode_4BYTE_KIND:
12445 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12446 break;
12447 default:
12448 assert(0);
12449 out = 0;
12450 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451
12452 Py_DECREF(sep_obj);
12453 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 if (kind1 != kind)
12455 PyMem_Free(buf1);
12456 if (kind2 != kind)
12457 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012458
12459 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 onError:
12461 Py_DECREF(sep_obj);
12462 Py_DECREF(str_obj);
12463 if (kind1 != kind && buf1)
12464 PyMem_Free(buf1);
12465 if (kind2 != kind && buf2)
12466 PyMem_Free(buf2);
12467 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012468}
12469
12470
12471PyObject *
12472PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12473{
12474 PyObject* str_obj;
12475 PyObject* sep_obj;
12476 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 int kind1, kind2, kind;
12478 void *buf1 = NULL, *buf2 = NULL;
12479 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012480
12481 str_obj = PyUnicode_FromObject(str_in);
12482 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012484 sep_obj = PyUnicode_FromObject(sep_in);
12485 if (!sep_obj) {
12486 Py_DECREF(str_obj);
12487 return NULL;
12488 }
12489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 kind1 = PyUnicode_KIND(str_in);
12491 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012492 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 buf1 = PyUnicode_DATA(str_in);
12494 if (kind1 != kind)
12495 buf1 = _PyUnicode_AsKind(str_in, kind);
12496 if (!buf1)
12497 goto onError;
12498 buf2 = PyUnicode_DATA(sep_obj);
12499 if (kind2 != kind)
12500 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12501 if (!buf2)
12502 goto onError;
12503 len1 = PyUnicode_GET_LENGTH(str_obj);
12504 len2 = PyUnicode_GET_LENGTH(sep_obj);
12505
Benjamin Petersonead6b532011-12-20 17:23:42 -060012506 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012508 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12509 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12510 else
12511 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 break;
12513 case PyUnicode_2BYTE_KIND:
12514 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12515 break;
12516 case PyUnicode_4BYTE_KIND:
12517 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12518 break;
12519 default:
12520 assert(0);
12521 out = 0;
12522 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012523
12524 Py_DECREF(sep_obj);
12525 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 if (kind1 != kind)
12527 PyMem_Free(buf1);
12528 if (kind2 != kind)
12529 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012530
12531 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 onError:
12533 Py_DECREF(sep_obj);
12534 Py_DECREF(str_obj);
12535 if (kind1 != kind && buf1)
12536 PyMem_Free(buf1);
12537 if (kind2 != kind && buf2)
12538 PyMem_Free(buf2);
12539 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012540}
12541
12542PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012543 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012544\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012545Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012546the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012547found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012548
12549static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012550unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012551{
Victor Stinner9310abb2011-10-05 00:59:23 +020012552 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012553}
12554
12555PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012556 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012557\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012558Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012559the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012560separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012561
12562static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012563unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012564{
Victor Stinner9310abb2011-10-05 00:59:23 +020012565 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012566}
12567
Alexander Belopolsky40018472011-02-26 01:02:56 +000012568PyObject *
12569PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012570{
12571 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012572
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012573 s = PyUnicode_FromObject(s);
12574 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012575 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 if (sep != NULL) {
12577 sep = PyUnicode_FromObject(sep);
12578 if (sep == NULL) {
12579 Py_DECREF(s);
12580 return NULL;
12581 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012582 }
12583
Victor Stinner9310abb2011-10-05 00:59:23 +020012584 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012585
12586 Py_DECREF(s);
12587 Py_XDECREF(sep);
12588 return result;
12589}
12590
12591PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012592 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012593\n\
12594Return a list of the words in S, using sep as the\n\
12595delimiter string, starting at the end of the string and\n\
12596working to the front. If maxsplit is given, at most maxsplit\n\
12597splits are done. If sep is not specified, any whitespace string\n\
12598is a separator.");
12599
12600static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012601unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012602{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012603 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012604 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012605 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012606
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012607 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12608 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012609 return NULL;
12610
12611 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012613 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012614 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012615 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012616 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012617}
12618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012619PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621\n\
12622Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012623Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012624is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625
12626static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012627unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012629 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012630 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012632 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12633 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634 return NULL;
12635
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012636 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637}
12638
12639static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012640PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012642 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643}
12644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012645PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647\n\
12648Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012649and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
12651static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012652unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012654 if (PyUnicode_READY(self) == -1)
12655 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012656 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657}
12658
Georg Brandlceee0772007-11-27 23:48:05 +000012659PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012661\n\
12662Return a translation table usable for str.translate().\n\
12663If there is only one argument, it must be a dictionary mapping Unicode\n\
12664ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012665Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012666If there are two arguments, they must be strings of equal length, and\n\
12667in the resulting dictionary, each character in x will be mapped to the\n\
12668character at the same position in y. If there is a third argument, it\n\
12669must be a string, whose characters will be mapped to None in the result.");
12670
12671static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012672unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012673{
12674 PyObject *x, *y = NULL, *z = NULL;
12675 PyObject *new = NULL, *key, *value;
12676 Py_ssize_t i = 0;
12677 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012678
Georg Brandlceee0772007-11-27 23:48:05 +000012679 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12680 return NULL;
12681 new = PyDict_New();
12682 if (!new)
12683 return NULL;
12684 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 int x_kind, y_kind, z_kind;
12686 void *x_data, *y_data, *z_data;
12687
Georg Brandlceee0772007-11-27 23:48:05 +000012688 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012689 if (!PyUnicode_Check(x)) {
12690 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12691 "be a string if there is a second argument");
12692 goto err;
12693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012695 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12696 "arguments must have equal length");
12697 goto err;
12698 }
12699 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 x_kind = PyUnicode_KIND(x);
12701 y_kind = PyUnicode_KIND(y);
12702 x_data = PyUnicode_DATA(x);
12703 y_data = PyUnicode_DATA(y);
12704 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12705 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012706 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012707 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012708 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012709 if (!value) {
12710 Py_DECREF(key);
12711 goto err;
12712 }
Georg Brandlceee0772007-11-27 23:48:05 +000012713 res = PyDict_SetItem(new, key, value);
12714 Py_DECREF(key);
12715 Py_DECREF(value);
12716 if (res < 0)
12717 goto err;
12718 }
12719 /* create entries for deleting chars in z */
12720 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 z_kind = PyUnicode_KIND(z);
12722 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012723 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012725 if (!key)
12726 goto err;
12727 res = PyDict_SetItem(new, key, Py_None);
12728 Py_DECREF(key);
12729 if (res < 0)
12730 goto err;
12731 }
12732 }
12733 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 int kind;
12735 void *data;
12736
Georg Brandlceee0772007-11-27 23:48:05 +000012737 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012738 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012739 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12740 "to maketrans it must be a dict");
12741 goto err;
12742 }
12743 /* copy entries into the new dict, converting string keys to int keys */
12744 while (PyDict_Next(x, &i, &key, &value)) {
12745 if (PyUnicode_Check(key)) {
12746 /* convert string keys to integer keys */
12747 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012748 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012749 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12750 "table must be of length 1");
12751 goto err;
12752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 kind = PyUnicode_KIND(key);
12754 data = PyUnicode_DATA(key);
12755 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012756 if (!newkey)
12757 goto err;
12758 res = PyDict_SetItem(new, newkey, value);
12759 Py_DECREF(newkey);
12760 if (res < 0)
12761 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012762 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012763 /* just keep integer keys */
12764 if (PyDict_SetItem(new, key, value) < 0)
12765 goto err;
12766 } else {
12767 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12768 "be strings or integers");
12769 goto err;
12770 }
12771 }
12772 }
12773 return new;
12774 err:
12775 Py_DECREF(new);
12776 return NULL;
12777}
12778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012779PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781\n\
12782Return a copy of the string S, where all characters have been mapped\n\
12783through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012784Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012785Unmapped characters are left untouched. Characters mapped to None\n\
12786are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787
12788static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792}
12793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012794PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012795 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012797Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798
12799static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012800unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012802 if (PyUnicode_READY(self) == -1)
12803 return NULL;
12804 if (PyUnicode_IS_ASCII(self))
12805 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012806 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807}
12808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012809PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012812Pad a numeric string S with zeros on the left, to fill a field\n\
12813of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814
12815static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012816unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012818 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012819 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012820 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 int kind;
12822 void *data;
12823 Py_UCS4 chr;
12824
Martin v. Löwis18e16552006-02-15 17:27:45 +000012825 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826 return NULL;
12827
Benjamin Petersonbac79492012-01-14 13:34:47 -050012828 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830
Victor Stinnerc4b49542011-12-11 22:44:26 +010012831 if (PyUnicode_GET_LENGTH(self) >= width)
12832 return unicode_result_unchanged(self);
12833
12834 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835
12836 u = pad(self, fill, 0, '0');
12837
Walter Dörwald068325e2002-04-15 13:36:47 +000012838 if (u == NULL)
12839 return NULL;
12840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 kind = PyUnicode_KIND(u);
12842 data = PyUnicode_DATA(u);
12843 chr = PyUnicode_READ(kind, data, fill);
12844
12845 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 PyUnicode_WRITE(kind, data, 0, chr);
12848 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849 }
12850
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012851 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012852 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854
#if 0
/* Compiled out: wrapper forwarding self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012863PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012866Return True if S starts with the specified prefix, False otherwise.\n\
12867With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012868With optional end, stop comparing S at that position.\n\
12869prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870
12871static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012872unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012875 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012876 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012877 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012878 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012879 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880
Jesus Ceaac451502011-04-20 17:09:23 +020012881 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012882 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012883 if (PyTuple_Check(subobj)) {
12884 Py_ssize_t i;
12885 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012886 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012887 if (substring == NULL)
12888 return NULL;
12889 result = tailmatch(self, substring, start, end, -1);
12890 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012891 if (result == -1)
12892 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012893 if (result) {
12894 Py_RETURN_TRUE;
12895 }
12896 }
12897 /* nothing matched */
12898 Py_RETURN_FALSE;
12899 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012900 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012901 if (substring == NULL) {
12902 if (PyErr_ExceptionMatches(PyExc_TypeError))
12903 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12904 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012905 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012906 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012907 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012909 if (result == -1)
12910 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012911 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912}
12913
12914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012915PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012916 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012917\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012918Return True if S ends with the specified suffix, False otherwise.\n\
12919With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012920With optional end, stop comparing S at that position.\n\
12921suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922
12923static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012924unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012925 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012927 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012928 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012929 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012930 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012931 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932
Jesus Ceaac451502011-04-20 17:09:23 +020012933 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012935 if (PyTuple_Check(subobj)) {
12936 Py_ssize_t i;
12937 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012938 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012939 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012940 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012941 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012942 result = tailmatch(self, substring, start, end, +1);
12943 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012944 if (result == -1)
12945 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012946 if (result) {
12947 Py_RETURN_TRUE;
12948 }
12949 }
12950 Py_RETURN_FALSE;
12951 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012952 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012953 if (substring == NULL) {
12954 if (PyErr_ExceptionMatches(PyExc_TypeError))
12955 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12956 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012958 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012959 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020012960 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012961 if (result == -1)
12962 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012963 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964}
12965
Victor Stinner202fdca2012-05-07 12:47:02 +020012966Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012967_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012968{
Victor Stinner8f674cc2013-04-17 23:02:17 +020012969 if (!writer->readonly)
12970 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12971 else {
12972 /* Copy-on-write mode: set buffer size to 0 so
12973 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
12974 * next write. */
12975 writer->size = 0;
12976 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012977 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12978 writer->data = PyUnicode_DATA(writer->buffer);
12979 writer->kind = PyUnicode_KIND(writer->buffer);
12980}
12981
Victor Stinnerd3f08822012-05-29 12:57:52 +020012982void
Victor Stinner8f674cc2013-04-17 23:02:17 +020012983_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012984{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012985 memset(writer, 0, sizeof(*writer));
12986#ifdef Py_DEBUG
12987 writer->kind = 5; /* invalid kind */
12988#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020012989 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020012990}
12991
Victor Stinnerd3f08822012-05-29 12:57:52 +020012992int
12993_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12994 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012995{
12996 Py_ssize_t newlen;
12997 PyObject *newbuffer;
12998
Victor Stinnerd3f08822012-05-29 12:57:52 +020012999 assert(length > 0);
13000
Victor Stinner202fdca2012-05-07 12:47:02 +020013001 if (length > PY_SSIZE_T_MAX - writer->pos) {
13002 PyErr_NoMemory();
13003 return -1;
13004 }
13005 newlen = writer->pos + length;
13006
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013007 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013008
Victor Stinnerd3f08822012-05-29 12:57:52 +020013009 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013010 assert(!writer->readonly);
13011 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013012 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013013 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013014 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013015 if (newlen < writer->min_length)
13016 newlen = writer->min_length;
13017
Victor Stinnerd3f08822012-05-29 12:57:52 +020013018 writer->buffer = PyUnicode_New(newlen, maxchar);
13019 if (writer->buffer == NULL)
13020 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013021 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013022 else if (newlen > writer->size) {
13023 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013024 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013025 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013026 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013027 if (newlen < writer->min_length)
13028 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013029
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013030 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013031 /* resize + widen */
13032 newbuffer = PyUnicode_New(newlen, maxchar);
13033 if (newbuffer == NULL)
13034 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013035 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13036 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013037 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013038 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013039 }
13040 else {
13041 newbuffer = resize_compact(writer->buffer, newlen);
13042 if (newbuffer == NULL)
13043 return -1;
13044 }
13045 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013046 }
13047 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013048 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013049 newbuffer = PyUnicode_New(writer->size, maxchar);
13050 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013051 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013052 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13053 writer->buffer, 0, writer->pos);
13054 Py_DECREF(writer->buffer);
13055 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013056 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013057 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013058 return 0;
13059}
13060
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013061Py_LOCAL_INLINE(int)
13062_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013063{
13064 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13065 return -1;
13066 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13067 writer->pos++;
13068 return 0;
13069}
13070
13071int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013072_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13073{
13074 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13075}
13076
13077int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013078_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13079{
13080 Py_UCS4 maxchar;
13081 Py_ssize_t len;
13082
13083 if (PyUnicode_READY(str) == -1)
13084 return -1;
13085 len = PyUnicode_GET_LENGTH(str);
13086 if (len == 0)
13087 return 0;
13088 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13089 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013090 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013091 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013092 Py_INCREF(str);
13093 writer->buffer = str;
13094 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013095 writer->pos += len;
13096 return 0;
13097 }
13098 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13099 return -1;
13100 }
13101 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13102 str, 0, len);
13103 writer->pos += len;
13104 return 0;
13105}
13106
Victor Stinnere215d962012-10-06 23:03:36 +020013107int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013108_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13109 Py_ssize_t start, Py_ssize_t end)
13110{
13111 Py_UCS4 maxchar;
13112 Py_ssize_t len;
13113
13114 if (PyUnicode_READY(str) == -1)
13115 return -1;
13116
13117 assert(0 <= start);
13118 assert(end <= PyUnicode_GET_LENGTH(str));
13119 assert(start <= end);
13120
13121 if (end == 0)
13122 return 0;
13123
13124 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13125 return _PyUnicodeWriter_WriteStr(writer, str);
13126
13127 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13128 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13129 else
13130 maxchar = writer->maxchar;
13131 len = end - start;
13132
13133 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13134 return -1;
13135
13136 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13137 str, start, len);
13138 writer->pos += len;
13139 return 0;
13140}
13141
13142int
Victor Stinnere215d962012-10-06 23:03:36 +020013143_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13144{
13145 Py_UCS4 maxchar;
13146
13147 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13148 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13149 return -1;
13150 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13151 writer->pos += len;
13152 return 0;
13153}
13154
Victor Stinnerd3f08822012-05-29 12:57:52 +020013155PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013156_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013157{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013158 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013159 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013160 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013161 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013162 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013163 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013164 str = writer->buffer;
13165 writer->buffer = NULL;
13166 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13167 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013168 }
13169 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13170 PyObject *newbuffer;
13171 newbuffer = resize_compact(writer->buffer, writer->pos);
13172 if (newbuffer == NULL) {
13173 Py_DECREF(writer->buffer);
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013174 writer->buffer = NULL;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013175 return NULL;
13176 }
13177 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013178 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013179 str = writer->buffer;
13180 writer->buffer = NULL;
13181 assert(_PyUnicode_CheckConsistency(str, 1));
13182 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013183}
13184
Victor Stinnerd3f08822012-05-29 12:57:52 +020013185void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013186_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013187{
13188 Py_CLEAR(writer->buffer);
13189}
13190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013192
13193PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013194 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013195\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013196Return a formatted version of S, using substitutions from args and kwargs.\n\
13197The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013198
Eric Smith27bbca62010-11-04 17:06:58 +000013199PyDoc_STRVAR(format_map__doc__,
13200 "S.format_map(mapping) -> str\n\
13201\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013202Return a formatted version of S, using substitutions from mapping.\n\
13203The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013204
Eric Smith4a7d76d2008-05-30 18:10:19 +000013205static PyObject *
13206unicode__format__(PyObject* self, PyObject* args)
13207{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013208 PyObject *format_spec;
13209 _PyUnicodeWriter writer;
13210 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013211
13212 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13213 return NULL;
13214
Victor Stinnerd3f08822012-05-29 12:57:52 +020013215 if (PyUnicode_READY(self) == -1)
13216 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013217 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013218 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13219 self, format_spec, 0,
13220 PyUnicode_GET_LENGTH(format_spec));
13221 if (ret == -1) {
13222 _PyUnicodeWriter_Dealloc(&writer);
13223 return NULL;
13224 }
13225 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013226}
13227
Eric Smith8c663262007-08-25 02:26:07 +000013228PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013230\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013231Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013232
13233static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013234unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 Py_ssize_t size;
13237
13238 /* If it's a compact object, account for base structure +
13239 character data. */
13240 if (PyUnicode_IS_COMPACT_ASCII(v))
13241 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13242 else if (PyUnicode_IS_COMPACT(v))
13243 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013244 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 else {
13246 /* If it is a two-block object, account for base object, and
13247 for character block if present. */
13248 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013249 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013251 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252 }
13253 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013254 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013255 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013257 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013258 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013259
13260 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013261}
13262
13263PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013264 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013265
13266static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013267unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013268{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013269 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 if (!copy)
13271 return NULL;
13272 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013273}
13274
Guido van Rossumd57fd912000-03-10 22:53:23 +000013275static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013276 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013277 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013278 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13279 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013280 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13281 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013282 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013283 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13284 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13285 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13286 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13287 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013288 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013289 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13290 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13291 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013292 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013293 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13294 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13295 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013296 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013297 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013298 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013299 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013300 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13301 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13302 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13303 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13304 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13305 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13306 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13307 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13308 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13309 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13310 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13311 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13312 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13313 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013314 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013315 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013316 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013317 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013318 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013319 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013320 {"maketrans", (PyCFunction) unicode_maketrans,
13321 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013322 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013323#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013324 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013325 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326#endif
13327
Benjamin Peterson14339b62009-01-31 16:36:08 +000013328 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013329 {NULL, NULL}
13330};
13331
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013332static PyObject *
13333unicode_mod(PyObject *v, PyObject *w)
13334{
Brian Curtindfc80e32011-08-10 20:28:54 -050013335 if (!PyUnicode_Check(v))
13336 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013337 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013338}
13339
13340static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013341 0, /*nb_add*/
13342 0, /*nb_subtract*/
13343 0, /*nb_multiply*/
13344 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013345};
13346
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013348 (lenfunc) unicode_length, /* sq_length */
13349 PyUnicode_Concat, /* sq_concat */
13350 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13351 (ssizeargfunc) unicode_getitem, /* sq_item */
13352 0, /* sq_slice */
13353 0, /* sq_ass_item */
13354 0, /* sq_ass_slice */
13355 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356};
13357
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013358static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013359unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361 if (PyUnicode_READY(self) == -1)
13362 return NULL;
13363
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013364 if (PyIndex_Check(item)) {
13365 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013366 if (i == -1 && PyErr_Occurred())
13367 return NULL;
13368 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013369 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013370 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013371 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013372 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013373 PyObject *result;
13374 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013375 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013376 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013378 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013379 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013380 return NULL;
13381 }
13382
13383 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013384 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013385 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013386 slicelength == PyUnicode_GET_LENGTH(self)) {
13387 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013388 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013389 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013390 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013391 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013392 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013393 src_kind = PyUnicode_KIND(self);
13394 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013395 if (!PyUnicode_IS_ASCII(self)) {
13396 kind_limit = kind_maxchar_limit(src_kind);
13397 max_char = 0;
13398 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13399 ch = PyUnicode_READ(src_kind, src_data, cur);
13400 if (ch > max_char) {
13401 max_char = ch;
13402 if (max_char >= kind_limit)
13403 break;
13404 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013405 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013406 }
Victor Stinner55c99112011-10-13 01:17:06 +020013407 else
13408 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013409 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013410 if (result == NULL)
13411 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013412 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013413 dest_data = PyUnicode_DATA(result);
13414
13415 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013416 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13417 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013418 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013419 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013420 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013421 } else {
13422 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13423 return NULL;
13424 }
13425}
13426
13427static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013428 (lenfunc)unicode_length, /* mp_length */
13429 (binaryfunc)unicode_subscript, /* mp_subscript */
13430 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013431};
13432
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434/* Helpers for PyUnicode_Format() */
13435
Victor Stinnera47082312012-10-04 02:19:54 +020013436struct unicode_formatter_t {
13437 PyObject *args;
13438 int args_owned;
13439 Py_ssize_t arglen, argidx;
13440 PyObject *dict;
13441
13442 enum PyUnicode_Kind fmtkind;
13443 Py_ssize_t fmtcnt, fmtpos;
13444 void *fmtdata;
13445 PyObject *fmtstr;
13446
13447 _PyUnicodeWriter writer;
13448};
13449
13450struct unicode_format_arg_t {
13451 Py_UCS4 ch;
13452 int flags;
13453 Py_ssize_t width;
13454 int prec;
13455 int sign;
13456};
13457
Guido van Rossumd57fd912000-03-10 22:53:23 +000013458static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013459unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013460{
Victor Stinnera47082312012-10-04 02:19:54 +020013461 Py_ssize_t argidx = ctx->argidx;
13462
13463 if (argidx < ctx->arglen) {
13464 ctx->argidx++;
13465 if (ctx->arglen < 0)
13466 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013467 else
Victor Stinnera47082312012-10-04 02:19:54 +020013468 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013469 }
13470 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013471 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013472 return NULL;
13473}
13474
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013475/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476
Victor Stinnera47082312012-10-04 02:19:54 +020013477/* Format a float into the writer if the writer is not NULL, or into *p_output
13478 otherwise.
13479
13480 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013481static int
Victor Stinnera47082312012-10-04 02:19:54 +020013482formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13483 PyObject **p_output,
13484 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013485{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013486 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013487 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013488 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013489 int prec;
13490 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013491
Guido van Rossumd57fd912000-03-10 22:53:23 +000013492 x = PyFloat_AsDouble(v);
13493 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013494 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013495
Victor Stinnera47082312012-10-04 02:19:54 +020013496 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013497 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013498 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013499
Victor Stinnera47082312012-10-04 02:19:54 +020013500 if (arg->flags & F_ALT)
13501 dtoa_flags = Py_DTSF_ALT;
13502 else
13503 dtoa_flags = 0;
13504 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013505 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506 return -1;
13507 len = strlen(p);
13508 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013509 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13510 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013511 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013512 }
Victor Stinner184252a2012-06-16 02:57:41 +020013513 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013514 writer->pos += len;
13515 }
13516 else
13517 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013518 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013520}
13521
Victor Stinnerd0880d52012-04-27 23:40:13 +020013522/* formatlong() emulates the format codes d, u, o, x and X, and
13523 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13524 * Python's regular ints.
13525 * Return value: a new PyUnicodeObject*, or NULL if error.
13526 * The output string is of the form
13527 * "-"? ("0x" | "0X")? digit+
13528 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13529 * set in flags. The case of hex digits will be correct,
13530 * There will be at least prec digits, zero-filled on the left if
13531 * necessary to get that many.
13532 * val object to be converted
13533 * flags bitmask of format flags; only F_ALT is looked at
13534 * prec minimum number of digits; 0-fill on left if needed
13535 * type a character in [duoxX]; u acts the same as d
13536 *
13537 * CAUTION: o, x and X conversions on regular ints can never
13538 * produce a '-' sign, but can for Python's unbounded ints.
13539 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013540static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013541formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013542{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013543 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013544 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013545 Py_ssize_t i;
13546 int sign; /* 1 if '-', else 0 */
13547 int len; /* number of characters */
13548 Py_ssize_t llen;
13549 int numdigits; /* len == numnondigits + numdigits */
13550 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013551 int prec = arg->prec;
13552 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013553
Victor Stinnerd0880d52012-04-27 23:40:13 +020013554 /* Avoid exceeding SSIZE_T_MAX */
13555 if (prec > INT_MAX-3) {
13556 PyErr_SetString(PyExc_OverflowError,
13557 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013558 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013559 }
13560
13561 assert(PyLong_Check(val));
13562
13563 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013564 default:
13565 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013566 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013567 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013568 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013569 /* int and int subclasses should print numerically when a numeric */
13570 /* format code is used (see issue18780) */
13571 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013572 break;
13573 case 'o':
13574 numnondigits = 2;
13575 result = PyNumber_ToBase(val, 8);
13576 break;
13577 case 'x':
13578 case 'X':
13579 numnondigits = 2;
13580 result = PyNumber_ToBase(val, 16);
13581 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013582 }
13583 if (!result)
13584 return NULL;
13585
13586 assert(unicode_modifiable(result));
13587 assert(PyUnicode_IS_READY(result));
13588 assert(PyUnicode_IS_ASCII(result));
13589
13590 /* To modify the string in-place, there can only be one reference. */
13591 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013592 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013593 PyErr_BadInternalCall();
13594 return NULL;
13595 }
13596 buf = PyUnicode_DATA(result);
13597 llen = PyUnicode_GET_LENGTH(result);
13598 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013599 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013600 PyErr_SetString(PyExc_ValueError,
13601 "string too large in _PyBytes_FormatLong");
13602 return NULL;
13603 }
13604 len = (int)llen;
13605 sign = buf[0] == '-';
13606 numnondigits += sign;
13607 numdigits = len - numnondigits;
13608 assert(numdigits > 0);
13609
13610 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013611 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013612 (type == 'o' || type == 'x' || type == 'X'))) {
13613 assert(buf[sign] == '0');
13614 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13615 buf[sign+1] == 'o');
13616 numnondigits -= 2;
13617 buf += 2;
13618 len -= 2;
13619 if (sign)
13620 buf[0] = '-';
13621 assert(len == numnondigits + numdigits);
13622 assert(numdigits > 0);
13623 }
13624
13625 /* Fill with leading zeroes to meet minimum width. */
13626 if (prec > numdigits) {
13627 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13628 numnondigits + prec);
13629 char *b1;
13630 if (!r1) {
13631 Py_DECREF(result);
13632 return NULL;
13633 }
13634 b1 = PyBytes_AS_STRING(r1);
13635 for (i = 0; i < numnondigits; ++i)
13636 *b1++ = *buf++;
13637 for (i = 0; i < prec - numdigits; i++)
13638 *b1++ = '0';
13639 for (i = 0; i < numdigits; i++)
13640 *b1++ = *buf++;
13641 *b1 = '\0';
13642 Py_DECREF(result);
13643 result = r1;
13644 buf = PyBytes_AS_STRING(result);
13645 len = numnondigits + prec;
13646 }
13647
13648 /* Fix up case for hex conversions. */
13649 if (type == 'X') {
13650 /* Need to convert all lower case letters to upper case.
13651 and need to convert 0x to 0X (and -0x to -0X). */
13652 for (i = 0; i < len; i++)
13653 if (buf[i] >= 'a' && buf[i] <= 'x')
13654 buf[i] -= 'a'-'A';
13655 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013656 if (!PyUnicode_Check(result)
13657 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013658 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013659 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013660 Py_DECREF(result);
13661 result = unicode;
13662 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013663 else if (len != PyUnicode_GET_LENGTH(result)) {
13664 if (PyUnicode_Resize(&result, len) < 0)
13665 Py_CLEAR(result);
13666 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013667 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013668}
13669
Victor Stinner621ef3d2012-10-02 00:33:47 +020013670/* Format an integer.
13671 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013672 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013673 * -1 and raise an exception on error */
13674static int
Victor Stinnera47082312012-10-04 02:19:54 +020013675mainformatlong(PyObject *v,
13676 struct unicode_format_arg_t *arg,
13677 PyObject **p_output,
13678 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013679{
13680 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013681 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013682
13683 if (!PyNumber_Check(v))
13684 goto wrongtype;
13685
13686 if (!PyLong_Check(v)) {
13687 iobj = PyNumber_Long(v);
13688 if (iobj == NULL) {
13689 if (PyErr_ExceptionMatches(PyExc_TypeError))
13690 goto wrongtype;
13691 return -1;
13692 }
13693 assert(PyLong_Check(iobj));
13694 }
13695 else {
13696 iobj = v;
13697 Py_INCREF(iobj);
13698 }
13699
13700 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013701 && arg->width == -1 && arg->prec == -1
13702 && !(arg->flags & (F_SIGN | F_BLANK))
13703 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013704 {
13705 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013706 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013707 int base;
13708
Victor Stinnera47082312012-10-04 02:19:54 +020013709 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013710 {
13711 default:
13712 assert(0 && "'type' not in [diuoxX]");
13713 case 'd':
13714 case 'i':
13715 case 'u':
13716 base = 10;
13717 break;
13718 case 'o':
13719 base = 8;
13720 break;
13721 case 'x':
13722 case 'X':
13723 base = 16;
13724 break;
13725 }
13726
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013727 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13728 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013729 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013730 }
13731 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013732 return 1;
13733 }
13734
Victor Stinnera47082312012-10-04 02:19:54 +020013735 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013736 Py_DECREF(iobj);
13737 if (res == NULL)
13738 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013739 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013740 return 0;
13741
13742wrongtype:
13743 PyErr_Format(PyExc_TypeError,
13744 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013745 "not %.200s",
13746 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013747 return -1;
13748}
13749
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013750static Py_UCS4
13751formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013752{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013753 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013754 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013755 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013756 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013757 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 goto onError;
13759 }
13760 else {
13761 /* Integer input truncated to a character */
13762 long x;
13763 x = PyLong_AsLong(v);
13764 if (x == -1 && PyErr_Occurred())
13765 goto onError;
13766
Victor Stinner8faf8212011-12-08 22:14:11 +010013767 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013768 PyErr_SetString(PyExc_OverflowError,
13769 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013770 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013771 }
13772
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013773 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013774 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013775
Benjamin Peterson29060642009-01-31 22:14:21 +000013776 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013777 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013778 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013779 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013780}
13781
Victor Stinnera47082312012-10-04 02:19:54 +020013782/* Parse options of an argument: flags, width, precision.
13783 Handle also "%(name)" syntax.
13784
13785 Return 0 if the argument has been formatted into arg->str.
13786 Return 1 if the argument has been written into ctx->writer,
13787 Raise an exception and return -1 on error. */
13788static int
13789unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13790 struct unicode_format_arg_t *arg)
13791{
13792#define FORMAT_READ(ctx) \
13793 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13794
13795 PyObject *v;
13796
Victor Stinnera47082312012-10-04 02:19:54 +020013797 if (arg->ch == '(') {
13798 /* Get argument value from a dictionary. Example: "%(name)s". */
13799 Py_ssize_t keystart;
13800 Py_ssize_t keylen;
13801 PyObject *key;
13802 int pcount = 1;
13803
13804 if (ctx->dict == NULL) {
13805 PyErr_SetString(PyExc_TypeError,
13806 "format requires a mapping");
13807 return -1;
13808 }
13809 ++ctx->fmtpos;
13810 --ctx->fmtcnt;
13811 keystart = ctx->fmtpos;
13812 /* Skip over balanced parentheses */
13813 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13814 arg->ch = FORMAT_READ(ctx);
13815 if (arg->ch == ')')
13816 --pcount;
13817 else if (arg->ch == '(')
13818 ++pcount;
13819 ctx->fmtpos++;
13820 }
13821 keylen = ctx->fmtpos - keystart - 1;
13822 if (ctx->fmtcnt < 0 || pcount > 0) {
13823 PyErr_SetString(PyExc_ValueError,
13824 "incomplete format key");
13825 return -1;
13826 }
13827 key = PyUnicode_Substring(ctx->fmtstr,
13828 keystart, keystart + keylen);
13829 if (key == NULL)
13830 return -1;
13831 if (ctx->args_owned) {
13832 Py_DECREF(ctx->args);
13833 ctx->args_owned = 0;
13834 }
13835 ctx->args = PyObject_GetItem(ctx->dict, key);
13836 Py_DECREF(key);
13837 if (ctx->args == NULL)
13838 return -1;
13839 ctx->args_owned = 1;
13840 ctx->arglen = -1;
13841 ctx->argidx = -2;
13842 }
13843
13844 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013845 while (--ctx->fmtcnt >= 0) {
13846 arg->ch = FORMAT_READ(ctx);
13847 ctx->fmtpos++;
13848 switch (arg->ch) {
13849 case '-': arg->flags |= F_LJUST; continue;
13850 case '+': arg->flags |= F_SIGN; continue;
13851 case ' ': arg->flags |= F_BLANK; continue;
13852 case '#': arg->flags |= F_ALT; continue;
13853 case '0': arg->flags |= F_ZERO; continue;
13854 }
13855 break;
13856 }
13857
13858 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013859 if (arg->ch == '*') {
13860 v = unicode_format_getnextarg(ctx);
13861 if (v == NULL)
13862 return -1;
13863 if (!PyLong_Check(v)) {
13864 PyErr_SetString(PyExc_TypeError,
13865 "* wants int");
13866 return -1;
13867 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013868 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013869 if (arg->width == -1 && PyErr_Occurred())
13870 return -1;
13871 if (arg->width < 0) {
13872 arg->flags |= F_LJUST;
13873 arg->width = -arg->width;
13874 }
13875 if (--ctx->fmtcnt >= 0) {
13876 arg->ch = FORMAT_READ(ctx);
13877 ctx->fmtpos++;
13878 }
13879 }
13880 else if (arg->ch >= '0' && arg->ch <= '9') {
13881 arg->width = arg->ch - '0';
13882 while (--ctx->fmtcnt >= 0) {
13883 arg->ch = FORMAT_READ(ctx);
13884 ctx->fmtpos++;
13885 if (arg->ch < '0' || arg->ch > '9')
13886 break;
13887 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13888 mixing signed and unsigned comparison. Since arg->ch is between
13889 '0' and '9', casting to int is safe. */
13890 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13891 PyErr_SetString(PyExc_ValueError,
13892 "width too big");
13893 return -1;
13894 }
13895 arg->width = arg->width*10 + (arg->ch - '0');
13896 }
13897 }
13898
13899 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013900 if (arg->ch == '.') {
13901 arg->prec = 0;
13902 if (--ctx->fmtcnt >= 0) {
13903 arg->ch = FORMAT_READ(ctx);
13904 ctx->fmtpos++;
13905 }
13906 if (arg->ch == '*') {
13907 v = unicode_format_getnextarg(ctx);
13908 if (v == NULL)
13909 return -1;
13910 if (!PyLong_Check(v)) {
13911 PyErr_SetString(PyExc_TypeError,
13912 "* wants int");
13913 return -1;
13914 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013915 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013916 if (arg->prec == -1 && PyErr_Occurred())
13917 return -1;
13918 if (arg->prec < 0)
13919 arg->prec = 0;
13920 if (--ctx->fmtcnt >= 0) {
13921 arg->ch = FORMAT_READ(ctx);
13922 ctx->fmtpos++;
13923 }
13924 }
13925 else if (arg->ch >= '0' && arg->ch <= '9') {
13926 arg->prec = arg->ch - '0';
13927 while (--ctx->fmtcnt >= 0) {
13928 arg->ch = FORMAT_READ(ctx);
13929 ctx->fmtpos++;
13930 if (arg->ch < '0' || arg->ch > '9')
13931 break;
13932 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13933 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013934 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013935 return -1;
13936 }
13937 arg->prec = arg->prec*10 + (arg->ch - '0');
13938 }
13939 }
13940 }
13941
13942 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13943 if (ctx->fmtcnt >= 0) {
13944 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13945 if (--ctx->fmtcnt >= 0) {
13946 arg->ch = FORMAT_READ(ctx);
13947 ctx->fmtpos++;
13948 }
13949 }
13950 }
13951 if (ctx->fmtcnt < 0) {
13952 PyErr_SetString(PyExc_ValueError,
13953 "incomplete format");
13954 return -1;
13955 }
13956 return 0;
13957
13958#undef FORMAT_READ
13959}
13960
13961/* Format one argument. Supported conversion specifiers:
13962
13963 - "s", "r", "a": any type
13964 - "i", "d", "u", "o", "x", "X": int
13965 - "e", "E", "f", "F", "g", "G": float
13966 - "c": int or str (1 character)
13967
Victor Stinner8dbd4212012-12-04 09:30:24 +010013968 When possible, the output is written directly into the Unicode writer
13969 (ctx->writer). A string is created when padding is required.
13970
Victor Stinnera47082312012-10-04 02:19:54 +020013971 Return 0 if the argument has been formatted into *p_str,
13972 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013973 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013974static int
13975unicode_format_arg_format(struct unicode_formatter_t *ctx,
13976 struct unicode_format_arg_t *arg,
13977 PyObject **p_str)
13978{
13979 PyObject *v;
13980 _PyUnicodeWriter *writer = &ctx->writer;
13981
13982 if (ctx->fmtcnt == 0)
13983 ctx->writer.overallocate = 0;
13984
13985 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013986 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020013987 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013988 return 1;
13989 }
13990
13991 v = unicode_format_getnextarg(ctx);
13992 if (v == NULL)
13993 return -1;
13994
Victor Stinnera47082312012-10-04 02:19:54 +020013995
13996 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013997 case 's':
13998 case 'r':
13999 case 'a':
14000 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14001 /* Fast path */
14002 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14003 return -1;
14004 return 1;
14005 }
14006
14007 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14008 *p_str = v;
14009 Py_INCREF(*p_str);
14010 }
14011 else {
14012 if (arg->ch == 's')
14013 *p_str = PyObject_Str(v);
14014 else if (arg->ch == 'r')
14015 *p_str = PyObject_Repr(v);
14016 else
14017 *p_str = PyObject_ASCII(v);
14018 }
14019 break;
14020
14021 case 'i':
14022 case 'd':
14023 case 'u':
14024 case 'o':
14025 case 'x':
14026 case 'X':
14027 {
14028 int ret = mainformatlong(v, arg, p_str, writer);
14029 if (ret != 0)
14030 return ret;
14031 arg->sign = 1;
14032 break;
14033 }
14034
14035 case 'e':
14036 case 'E':
14037 case 'f':
14038 case 'F':
14039 case 'g':
14040 case 'G':
14041 if (arg->width == -1 && arg->prec == -1
14042 && !(arg->flags & (F_SIGN | F_BLANK)))
14043 {
14044 /* Fast path */
14045 if (formatfloat(v, arg, NULL, writer) == -1)
14046 return -1;
14047 return 1;
14048 }
14049
14050 arg->sign = 1;
14051 if (formatfloat(v, arg, p_str, NULL) == -1)
14052 return -1;
14053 break;
14054
14055 case 'c':
14056 {
14057 Py_UCS4 ch = formatchar(v);
14058 if (ch == (Py_UCS4) -1)
14059 return -1;
14060 if (arg->width == -1 && arg->prec == -1) {
14061 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014062 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014063 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014064 return 1;
14065 }
14066 *p_str = PyUnicode_FromOrdinal(ch);
14067 break;
14068 }
14069
14070 default:
14071 PyErr_Format(PyExc_ValueError,
14072 "unsupported format character '%c' (0x%x) "
14073 "at index %zd",
14074 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14075 (int)arg->ch,
14076 ctx->fmtpos - 1);
14077 return -1;
14078 }
14079 if (*p_str == NULL)
14080 return -1;
14081 assert (PyUnicode_Check(*p_str));
14082 return 0;
14083}
14084
14085static int
14086unicode_format_arg_output(struct unicode_formatter_t *ctx,
14087 struct unicode_format_arg_t *arg,
14088 PyObject *str)
14089{
14090 Py_ssize_t len;
14091 enum PyUnicode_Kind kind;
14092 void *pbuf;
14093 Py_ssize_t pindex;
14094 Py_UCS4 signchar;
14095 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014096 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014097 Py_ssize_t sublen;
14098 _PyUnicodeWriter *writer = &ctx->writer;
14099 Py_UCS4 fill;
14100
14101 fill = ' ';
14102 if (arg->sign && arg->flags & F_ZERO)
14103 fill = '0';
14104
14105 if (PyUnicode_READY(str) == -1)
14106 return -1;
14107
14108 len = PyUnicode_GET_LENGTH(str);
14109 if ((arg->width == -1 || arg->width <= len)
14110 && (arg->prec == -1 || arg->prec >= len)
14111 && !(arg->flags & (F_SIGN | F_BLANK)))
14112 {
14113 /* Fast path */
14114 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14115 return -1;
14116 return 0;
14117 }
14118
14119 /* Truncate the string for "s", "r" and "a" formats
14120 if the precision is set */
14121 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14122 if (arg->prec >= 0 && len > arg->prec)
14123 len = arg->prec;
14124 }
14125
14126 /* Adjust sign and width */
14127 kind = PyUnicode_KIND(str);
14128 pbuf = PyUnicode_DATA(str);
14129 pindex = 0;
14130 signchar = '\0';
14131 if (arg->sign) {
14132 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14133 if (ch == '-' || ch == '+') {
14134 signchar = ch;
14135 len--;
14136 pindex++;
14137 }
14138 else if (arg->flags & F_SIGN)
14139 signchar = '+';
14140 else if (arg->flags & F_BLANK)
14141 signchar = ' ';
14142 else
14143 arg->sign = 0;
14144 }
14145 if (arg->width < len)
14146 arg->width = len;
14147
14148 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014149 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014150 if (!(arg->flags & F_LJUST)) {
14151 if (arg->sign) {
14152 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014153 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014154 }
14155 else {
14156 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014157 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014158 }
14159 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014160 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14161 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014162 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014163 }
14164
Victor Stinnera47082312012-10-04 02:19:54 +020014165 buflen = arg->width;
14166 if (arg->sign && len == arg->width)
14167 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014168 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014169 return -1;
14170
14171 /* Write the sign if needed */
14172 if (arg->sign) {
14173 if (fill != ' ') {
14174 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14175 writer->pos += 1;
14176 }
14177 if (arg->width > len)
14178 arg->width--;
14179 }
14180
14181 /* Write the numeric prefix for "x", "X" and "o" formats
14182 if the alternate form is used.
14183 For example, write "0x" for the "%#x" format. */
14184 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14185 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14186 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14187 if (fill != ' ') {
14188 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14189 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14190 writer->pos += 2;
14191 pindex += 2;
14192 }
14193 arg->width -= 2;
14194 if (arg->width < 0)
14195 arg->width = 0;
14196 len -= 2;
14197 }
14198
14199 /* Pad left with the fill character if needed */
14200 if (arg->width > len && !(arg->flags & F_LJUST)) {
14201 sublen = arg->width - len;
14202 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14203 writer->pos += sublen;
14204 arg->width = len;
14205 }
14206
14207 /* If padding with spaces: write sign if needed and/or numeric prefix if
14208 the alternate form is used */
14209 if (fill == ' ') {
14210 if (arg->sign) {
14211 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14212 writer->pos += 1;
14213 }
14214 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14215 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14216 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14217 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14218 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14219 writer->pos += 2;
14220 pindex += 2;
14221 }
14222 }
14223
14224 /* Write characters */
14225 if (len) {
14226 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14227 str, pindex, len);
14228 writer->pos += len;
14229 }
14230
14231 /* Pad right with the fill character if needed */
14232 if (arg->width > len) {
14233 sublen = arg->width - len;
14234 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14235 writer->pos += sublen;
14236 }
14237 return 0;
14238}
14239
14240/* Helper of PyUnicode_Format(): format one arg.
14241 Return 0 on success, raise an exception and return -1 on error. */
14242static int
14243unicode_format_arg(struct unicode_formatter_t *ctx)
14244{
14245 struct unicode_format_arg_t arg;
14246 PyObject *str;
14247 int ret;
14248
Victor Stinner8dbd4212012-12-04 09:30:24 +010014249 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14250 arg.flags = 0;
14251 arg.width = -1;
14252 arg.prec = -1;
14253 arg.sign = 0;
14254 str = NULL;
14255
Victor Stinnera47082312012-10-04 02:19:54 +020014256 ret = unicode_format_arg_parse(ctx, &arg);
14257 if (ret == -1)
14258 return -1;
14259
14260 ret = unicode_format_arg_format(ctx, &arg, &str);
14261 if (ret == -1)
14262 return -1;
14263
14264 if (ret != 1) {
14265 ret = unicode_format_arg_output(ctx, &arg, str);
14266 Py_DECREF(str);
14267 if (ret == -1)
14268 return -1;
14269 }
14270
14271 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14272 PyErr_SetString(PyExc_TypeError,
14273 "not all arguments converted during string formatting");
14274 return -1;
14275 }
14276 return 0;
14277}
14278
Alexander Belopolsky40018472011-02-26 01:02:56 +000014279PyObject *
14280PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014281{
Victor Stinnera47082312012-10-04 02:19:54 +020014282 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014283
Guido van Rossumd57fd912000-03-10 22:53:23 +000014284 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014285 PyErr_BadInternalCall();
14286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014287 }
Victor Stinnera47082312012-10-04 02:19:54 +020014288
14289 ctx.fmtstr = PyUnicode_FromObject(format);
14290 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014291 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014292 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14293 Py_DECREF(ctx.fmtstr);
14294 return NULL;
14295 }
14296 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14297 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14298 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14299 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014300
Victor Stinner8f674cc2013-04-17 23:02:17 +020014301 _PyUnicodeWriter_Init(&ctx.writer);
14302 ctx.writer.min_length = ctx.fmtcnt + 100;
14303 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014304
Guido van Rossumd57fd912000-03-10 22:53:23 +000014305 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014306 ctx.arglen = PyTuple_Size(args);
14307 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014308 }
14309 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014310 ctx.arglen = -1;
14311 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014312 }
Victor Stinnera47082312012-10-04 02:19:54 +020014313 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014314 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014315 ctx.dict = args;
14316 else
14317 ctx.dict = NULL;
14318 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014319
Victor Stinnera47082312012-10-04 02:19:54 +020014320 while (--ctx.fmtcnt >= 0) {
14321 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014322 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014323
14324 nonfmtpos = ctx.fmtpos++;
14325 while (ctx.fmtcnt >= 0 &&
14326 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14327 ctx.fmtpos++;
14328 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014329 }
Victor Stinnera47082312012-10-04 02:19:54 +020014330 if (ctx.fmtcnt < 0) {
14331 ctx.fmtpos--;
14332 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014333 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014334
Victor Stinnercfc4c132013-04-03 01:48:39 +020014335 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14336 nonfmtpos, ctx.fmtpos) < 0)
14337 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014338 }
14339 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014340 ctx.fmtpos++;
14341 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014342 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014343 }
14344 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014345
Victor Stinnera47082312012-10-04 02:19:54 +020014346 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014347 PyErr_SetString(PyExc_TypeError,
14348 "not all arguments converted during string formatting");
14349 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014350 }
14351
Victor Stinnera47082312012-10-04 02:19:54 +020014352 if (ctx.args_owned) {
14353 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014354 }
Victor Stinnera47082312012-10-04 02:19:54 +020014355 Py_DECREF(ctx.fmtstr);
14356 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014357
Benjamin Peterson29060642009-01-31 22:14:21 +000014358 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014359 Py_DECREF(ctx.fmtstr);
14360 _PyUnicodeWriter_Dealloc(&ctx.writer);
14361 if (ctx.args_owned) {
14362 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014363 }
14364 return NULL;
14365}
14366
Jeremy Hylton938ace62002-07-17 16:30:39 +000014367static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014368unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14369
Tim Peters6d6c1a32001-08-02 04:15:00 +000014370static PyObject *
14371unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14372{
Benjamin Peterson29060642009-01-31 22:14:21 +000014373 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 static char *kwlist[] = {"object", "encoding", "errors", 0};
14375 char *encoding = NULL;
14376 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014377
Benjamin Peterson14339b62009-01-31 16:36:08 +000014378 if (type != &PyUnicode_Type)
14379 return unicode_subtype_new(type, args, kwds);
14380 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014381 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014382 return NULL;
14383 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014384 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014385 if (encoding == NULL && errors == NULL)
14386 return PyObject_Str(x);
14387 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014388 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014389}
14390
Guido van Rossume023fe02001-08-30 03:12:59 +000014391static PyObject *
14392unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14393{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014394 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014395 Py_ssize_t length, char_size;
14396 int share_wstr, share_utf8;
14397 unsigned int kind;
14398 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014399
Benjamin Peterson14339b62009-01-31 16:36:08 +000014400 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014401
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014402 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014403 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014404 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014405 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014406 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014407 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014408 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014409 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014410
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014411 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014412 if (self == NULL) {
14413 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014414 return NULL;
14415 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014416 kind = PyUnicode_KIND(unicode);
14417 length = PyUnicode_GET_LENGTH(unicode);
14418
14419 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014420#ifdef Py_DEBUG
14421 _PyUnicode_HASH(self) = -1;
14422#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014423 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014424#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014425 _PyUnicode_STATE(self).interned = 0;
14426 _PyUnicode_STATE(self).kind = kind;
14427 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014428 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014429 _PyUnicode_STATE(self).ready = 1;
14430 _PyUnicode_WSTR(self) = NULL;
14431 _PyUnicode_UTF8_LENGTH(self) = 0;
14432 _PyUnicode_UTF8(self) = NULL;
14433 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014434 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014435
14436 share_utf8 = 0;
14437 share_wstr = 0;
14438 if (kind == PyUnicode_1BYTE_KIND) {
14439 char_size = 1;
14440 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14441 share_utf8 = 1;
14442 }
14443 else if (kind == PyUnicode_2BYTE_KIND) {
14444 char_size = 2;
14445 if (sizeof(wchar_t) == 2)
14446 share_wstr = 1;
14447 }
14448 else {
14449 assert(kind == PyUnicode_4BYTE_KIND);
14450 char_size = 4;
14451 if (sizeof(wchar_t) == 4)
14452 share_wstr = 1;
14453 }
14454
14455 /* Ensure we won't overflow the length. */
14456 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14457 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014458 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014459 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014460 data = PyObject_MALLOC((length + 1) * char_size);
14461 if (data == NULL) {
14462 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014463 goto onError;
14464 }
14465
Victor Stinnerc3c74152011-10-02 20:39:55 +020014466 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014467 if (share_utf8) {
14468 _PyUnicode_UTF8_LENGTH(self) = length;
14469 _PyUnicode_UTF8(self) = data;
14470 }
14471 if (share_wstr) {
14472 _PyUnicode_WSTR_LENGTH(self) = length;
14473 _PyUnicode_WSTR(self) = (wchar_t *)data;
14474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014475
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014476 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014477 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014478 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014479#ifdef Py_DEBUG
14480 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14481#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014482 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014483 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014484
14485onError:
14486 Py_DECREF(unicode);
14487 Py_DECREF(self);
14488 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014489}
14490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014491PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014492"str(object='') -> str\n\
14493str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014494\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014495Create a new string object from the given object. If encoding or\n\
14496errors is specified, then the object must expose a data buffer\n\
14497that will be decoded using the given encoding and error handler.\n\
14498Otherwise, returns the result of object.__str__() (if defined)\n\
14499or repr(object).\n\
14500encoding defaults to sys.getdefaultencoding().\n\
14501errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014502
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014503static PyObject *unicode_iter(PyObject *seq);
14504
Guido van Rossumd57fd912000-03-10 22:53:23 +000014505PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014506 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014507 "str", /* tp_name */
14508 sizeof(PyUnicodeObject), /* tp_size */
14509 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014510 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014511 (destructor)unicode_dealloc, /* tp_dealloc */
14512 0, /* tp_print */
14513 0, /* tp_getattr */
14514 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014515 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014516 unicode_repr, /* tp_repr */
14517 &unicode_as_number, /* tp_as_number */
14518 &unicode_as_sequence, /* tp_as_sequence */
14519 &unicode_as_mapping, /* tp_as_mapping */
14520 (hashfunc) unicode_hash, /* tp_hash*/
14521 0, /* tp_call*/
14522 (reprfunc) unicode_str, /* tp_str */
14523 PyObject_GenericGetAttr, /* tp_getattro */
14524 0, /* tp_setattro */
14525 0, /* tp_as_buffer */
14526 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014527 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014528 unicode_doc, /* tp_doc */
14529 0, /* tp_traverse */
14530 0, /* tp_clear */
14531 PyUnicode_RichCompare, /* tp_richcompare */
14532 0, /* tp_weaklistoffset */
14533 unicode_iter, /* tp_iter */
14534 0, /* tp_iternext */
14535 unicode_methods, /* tp_methods */
14536 0, /* tp_members */
14537 0, /* tp_getset */
14538 &PyBaseObject_Type, /* tp_base */
14539 0, /* tp_dict */
14540 0, /* tp_descr_get */
14541 0, /* tp_descr_set */
14542 0, /* tp_dictoffset */
14543 0, /* tp_init */
14544 0, /* tp_alloc */
14545 unicode_new, /* tp_new */
14546 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014547};
14548
14549/* Initialize the Unicode implementation */
14550
Victor Stinner3a50e702011-10-18 21:21:00 +020014551int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014552{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014553 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014554 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014555 0x000A, /* LINE FEED */
14556 0x000D, /* CARRIAGE RETURN */
14557 0x001C, /* FILE SEPARATOR */
14558 0x001D, /* GROUP SEPARATOR */
14559 0x001E, /* RECORD SEPARATOR */
14560 0x0085, /* NEXT LINE */
14561 0x2028, /* LINE SEPARATOR */
14562 0x2029, /* PARAGRAPH SEPARATOR */
14563 };
14564
Fred Drakee4315f52000-05-09 19:53:39 +000014565 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014566 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014567 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014568 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014569 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014570
Guido van Rossumcacfc072002-05-24 19:01:59 +000014571 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014572 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014573
14574 /* initialize the linebreak bloom filter */
14575 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014576 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014577 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014578
Christian Heimes26532f72013-07-20 14:57:16 +020014579 if (PyType_Ready(&EncodingMapType) < 0)
14580 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014581
Benjamin Petersonc4311282012-10-30 23:21:10 -040014582 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14583 Py_FatalError("Can't initialize field name iterator type");
14584
14585 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14586 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014587
Victor Stinner3a50e702011-10-18 21:21:00 +020014588#ifdef HAVE_MBCS
14589 winver.dwOSVersionInfoSize = sizeof(winver);
14590 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14591 PyErr_SetFromWindowsErr(0);
14592 return -1;
14593 }
14594#endif
14595 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014596}
14597
14598/* Finalize the Unicode implementation */
14599
/* Nothing is released by this function; it always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14605
Guido van Rossumd57fd912000-03-10 22:53:23 +000014606void
Thomas Wouters78890102000-07-22 19:25:51 +000014607_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014608{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014609 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014610
Serhiy Storchaka05997252013-01-26 12:14:02 +020014611 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014612
Serhiy Storchaka05997252013-01-26 12:14:02 +020014613 for (i = 0; i < 256; i++)
14614 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014615 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014616 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014617}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014618
Walter Dörwald16807132007-05-25 13:52:07 +000014619void
14620PyUnicode_InternInPlace(PyObject **p)
14621{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014622 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014623 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014624#ifdef Py_DEBUG
14625 assert(s != NULL);
14626 assert(_PyUnicode_CHECK(s));
14627#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014628 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014629 return;
14630#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014631 /* If it's a subclass, we don't really know what putting
14632 it in the interned dict might do. */
14633 if (!PyUnicode_CheckExact(s))
14634 return;
14635 if (PyUnicode_CHECK_INTERNED(s))
14636 return;
14637 if (interned == NULL) {
14638 interned = PyDict_New();
14639 if (interned == NULL) {
14640 PyErr_Clear(); /* Don't leave an exception */
14641 return;
14642 }
14643 }
14644 /* It might be that the GetItem call fails even
14645 though the key is present in the dictionary,
14646 namely when this happens during a stack overflow. */
14647 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014648 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014649 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014650
Victor Stinnerf0335102013-04-14 19:13:03 +020014651 if (t) {
14652 Py_INCREF(t);
14653 Py_DECREF(*p);
14654 *p = t;
14655 return;
14656 }
Walter Dörwald16807132007-05-25 13:52:07 +000014657
Benjamin Peterson14339b62009-01-31 16:36:08 +000014658 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014659 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014660 PyErr_Clear();
14661 PyThreadState_GET()->recursion_critical = 0;
14662 return;
14663 }
14664 PyThreadState_GET()->recursion_critical = 0;
14665 /* The two references in interned are not counted by refcnt.
14666 The deallocator will take care of this */
14667 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014668 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014669}
14670
14671void
14672PyUnicode_InternImmortal(PyObject **p)
14673{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014674 PyUnicode_InternInPlace(p);
14675 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014676 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014677 Py_INCREF(*p);
14678 }
Walter Dörwald16807132007-05-25 13:52:07 +000014679}
14680
14681PyObject *
14682PyUnicode_InternFromString(const char *cp)
14683{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014684 PyObject *s = PyUnicode_FromString(cp);
14685 if (s == NULL)
14686 return NULL;
14687 PyUnicode_InternInPlace(&s);
14688 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014689}
14690
Alexander Belopolsky40018472011-02-26 01:02:56 +000014691void
14692_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014693{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014694 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014695 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014696 Py_ssize_t i, n;
14697 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014698
Benjamin Peterson14339b62009-01-31 16:36:08 +000014699 if (interned == NULL || !PyDict_Check(interned))
14700 return;
14701 keys = PyDict_Keys(interned);
14702 if (keys == NULL || !PyList_Check(keys)) {
14703 PyErr_Clear();
14704 return;
14705 }
Walter Dörwald16807132007-05-25 13:52:07 +000014706
Benjamin Peterson14339b62009-01-31 16:36:08 +000014707 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14708 detector, interned unicode strings are not forcibly deallocated;
14709 rather, we give them their stolen references back, and then clear
14710 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014711
Benjamin Peterson14339b62009-01-31 16:36:08 +000014712 n = PyList_GET_SIZE(keys);
14713 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014714 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014715 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014716 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014717 if (PyUnicode_READY(s) == -1) {
14718 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014719 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014720 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014721 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014722 case SSTATE_NOT_INTERNED:
14723 /* XXX Shouldn't happen */
14724 break;
14725 case SSTATE_INTERNED_IMMORTAL:
14726 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014727 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014728 break;
14729 case SSTATE_INTERNED_MORTAL:
14730 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014731 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014732 break;
14733 default:
14734 Py_FatalError("Inconsistent interned string state.");
14735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014736 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014737 }
14738 fprintf(stderr, "total size of all interned strings: "
14739 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14740 "mortal/immortal\n", mortal_size, immortal_size);
14741 Py_DECREF(keys);
14742 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014743 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014744}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014745
14746
14747/********************* Unicode Iterator **************************/
14748
14749typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014750 PyObject_HEAD
14751 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014752 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014753} unicodeiterobject;
14754
14755static void
14756unicodeiter_dealloc(unicodeiterobject *it)
14757{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014758 _PyObject_GC_UNTRACK(it);
14759 Py_XDECREF(it->it_seq);
14760 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014761}
14762
14763static int
14764unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14765{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014766 Py_VISIT(it->it_seq);
14767 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014768}
14769
14770static PyObject *
14771unicodeiter_next(unicodeiterobject *it)
14772{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014773 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014774
Benjamin Peterson14339b62009-01-31 16:36:08 +000014775 assert(it != NULL);
14776 seq = it->it_seq;
14777 if (seq == NULL)
14778 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014779 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014781 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14782 int kind = PyUnicode_KIND(seq);
14783 void *data = PyUnicode_DATA(seq);
14784 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14785 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014786 if (item != NULL)
14787 ++it->it_index;
14788 return item;
14789 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014790
Benjamin Peterson14339b62009-01-31 16:36:08 +000014791 Py_DECREF(seq);
14792 it->it_seq = NULL;
14793 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014794}
14795
14796static PyObject *
14797unicodeiter_len(unicodeiterobject *it)
14798{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014799 Py_ssize_t len = 0;
14800 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014801 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014802 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014803}
14804
14805PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14806
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014807static PyObject *
14808unicodeiter_reduce(unicodeiterobject *it)
14809{
14810 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014811 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014812 it->it_seq, it->it_index);
14813 } else {
14814 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14815 if (u == NULL)
14816 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014817 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014818 }
14819}
14820
14821PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14822
14823static PyObject *
14824unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14825{
14826 Py_ssize_t index = PyLong_AsSsize_t(state);
14827 if (index == -1 && PyErr_Occurred())
14828 return NULL;
14829 if (index < 0)
14830 index = 0;
14831 it->it_index = index;
14832 Py_RETURN_NONE;
14833}
14834
14835PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14836
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014837static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014838 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014839 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014840 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14841 reduce_doc},
14842 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14843 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014844 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014845};
14846
14847PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014848 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14849 "str_iterator", /* tp_name */
14850 sizeof(unicodeiterobject), /* tp_basicsize */
14851 0, /* tp_itemsize */
14852 /* methods */
14853 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14854 0, /* tp_print */
14855 0, /* tp_getattr */
14856 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014857 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014858 0, /* tp_repr */
14859 0, /* tp_as_number */
14860 0, /* tp_as_sequence */
14861 0, /* tp_as_mapping */
14862 0, /* tp_hash */
14863 0, /* tp_call */
14864 0, /* tp_str */
14865 PyObject_GenericGetAttr, /* tp_getattro */
14866 0, /* tp_setattro */
14867 0, /* tp_as_buffer */
14868 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14869 0, /* tp_doc */
14870 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14871 0, /* tp_clear */
14872 0, /* tp_richcompare */
14873 0, /* tp_weaklistoffset */
14874 PyObject_SelfIter, /* tp_iter */
14875 (iternextfunc)unicodeiter_next, /* tp_iternext */
14876 unicodeiter_methods, /* tp_methods */
14877 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014878};
14879
14880static PyObject *
14881unicode_iter(PyObject *seq)
14882{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014883 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014884
Benjamin Peterson14339b62009-01-31 16:36:08 +000014885 if (!PyUnicode_Check(seq)) {
14886 PyErr_BadInternalCall();
14887 return NULL;
14888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014889 if (PyUnicode_READY(seq) == -1)
14890 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014891 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14892 if (it == NULL)
14893 return NULL;
14894 it->it_index = 0;
14895 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014896 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014897 _PyObject_GC_TRACK(it);
14898 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014899}
14900
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014901
14902size_t
14903Py_UNICODE_strlen(const Py_UNICODE *u)
14904{
14905 int res = 0;
14906 while(*u++)
14907 res++;
14908 return res;
14909}
14910
14911Py_UNICODE*
14912Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14913{
14914 Py_UNICODE *u = s1;
14915 while ((*u++ = *s2++));
14916 return s1;
14917}
14918
14919Py_UNICODE*
14920Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14921{
14922 Py_UNICODE *u = s1;
14923 while ((*u++ = *s2++))
14924 if (n-- == 0)
14925 break;
14926 return s1;
14927}
14928
14929Py_UNICODE*
14930Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14931{
14932 Py_UNICODE *u1 = s1;
14933 u1 += Py_UNICODE_strlen(u1);
14934 Py_UNICODE_strcpy(u1, s2);
14935 return s1;
14936}
14937
14938int
14939Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14940{
14941 while (*s1 && *s2 && *s1 == *s2)
14942 s1++, s2++;
14943 if (*s1 && *s2)
14944 return (*s1 < *s2) ? -1 : +1;
14945 if (*s1)
14946 return 1;
14947 if (*s2)
14948 return -1;
14949 return 0;
14950}
14951
14952int
14953Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14954{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014955 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014956 for (; n != 0; n--) {
14957 u1 = *s1;
14958 u2 = *s2;
14959 if (u1 != u2)
14960 return (u1 < u2) ? -1 : +1;
14961 if (u1 == '\0')
14962 return 0;
14963 s1++;
14964 s2++;
14965 }
14966 return 0;
14967}
14968
14969Py_UNICODE*
14970Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14971{
14972 const Py_UNICODE *p;
14973 for (p = s; *p; p++)
14974 if (*p == c)
14975 return (Py_UNICODE*)p;
14976 return NULL;
14977}
14978
14979Py_UNICODE*
14980Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14981{
14982 const Py_UNICODE *p;
14983 p = s + Py_UNICODE_strlen(s);
14984 while (p != s) {
14985 p--;
14986 if (*p == c)
14987 return (Py_UNICODE*)p;
14988 }
14989 return NULL;
14990}
Victor Stinner331ea922010-08-10 16:37:20 +000014991
Victor Stinner71133ff2010-09-01 23:43:53 +000014992Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014993PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014994{
Victor Stinner577db2c2011-10-11 22:12:48 +020014995 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014996 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014998 if (!PyUnicode_Check(unicode)) {
14999 PyErr_BadArgument();
15000 return NULL;
15001 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015002 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015003 if (u == NULL)
15004 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015005 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015006 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015007 PyErr_NoMemory();
15008 return NULL;
15009 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015010 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015011 size *= sizeof(Py_UNICODE);
15012 copy = PyMem_Malloc(size);
15013 if (copy == NULL) {
15014 PyErr_NoMemory();
15015 return NULL;
15016 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015017 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015018 return copy;
15019}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015020
Georg Brandl66c221e2010-10-14 07:04:07 +000015021/* A _string module, to export formatter_parser and formatter_field_name_split
15022 to the string.Formatter class implemented in Python. */
15023
15024static PyMethodDef _string_methods[] = {
15025 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15026 METH_O, PyDoc_STR("split the argument as a field name")},
15027 {"formatter_parser", (PyCFunction) formatter_parser,
15028 METH_O, PyDoc_STR("parse the argument as a format string")},
15029 {NULL, NULL}
15030};
15031
15032static struct PyModuleDef _string_module = {
15033 PyModuleDef_HEAD_INIT,
15034 "_string",
15035 PyDoc_STR("string helper module"),
15036 0,
15037 _string_methods,
15038 NULL,
15039 NULL,
15040 NULL,
15041 NULL
15042};
15043
15044PyMODINIT_FUNC
15045PyInit__string(void)
15046{
15047 return PyModule_Create(&_string_module);
15048}
15049
15050
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015051#ifdef __cplusplus
15052}
15053#endif