blob: b2f488d1f25980546dbd63ebdbfabc913737394c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
Serhiy Storchaka05997252013-01-26 12:14:02 +020052NOTE: In the interpreter's initialization phase, some globals are currently
53 initialized dynamically as needed. In the process Unicode objects may
54 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055
56*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
Victor Stinner8faf8212011-12-08 22:14:11 +010063/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
64#define MAX_UNICODE 0x10ffff
65
/* Type check used by the internal accessor macros of this file.
   Release builds only verify the type with PyUnicode_Check(); debug builds
   (Py_DEBUG) run _PyUnicode_CheckConsistency() with check_content == 0. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* --- Field accessors -----------------------------------------------------

   Macros whose name starts with an underscore and ends in a field name are
   raw accessors: they cast and dereference without any check and may be
   used as lvalues.  The other macros assert the object first. */

/* UTF-8 representation: raw field of the compact object. */
#define _PyUnicode_UTF8(op)                                     \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 representation of a ready string.  For a compact ASCII string this
   is the memory located right after the PyASCIIObject header, otherwise it
   is the utf8 field. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                          \
         _PyUnicode_UTF8(op) :                                  \
         ((char*)((PyASCIIObject*)(op) + 1)))

/* Length of the UTF-8 representation: raw field of the compact object. */
#define _PyUnicode_UTF8_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 representation of a ready string.  For a compact
   ASCII string this is the length field of the PyASCIIObject header. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                          \
         _PyUnicode_UTF8_LENGTH(op) :                           \
         ((PyASCIIObject*)(op))->length)

/* Raw access to the wstr pointer and its length. */
#define _PyUnicode_WSTR(op)                                     \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Raw access to the length, state and hash fields. */
#define _PyUnicode_LENGTH(op)                                   \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                    \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                     \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and the length. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)

/* Raw access to the data pointer stored in a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                                 \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* Local override of PyUnicode_READY(): asserts the object, then evaluates to
   0 when the string is already ready and to the result of _PyUnicode_Ready()
   otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready(op) :                            \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200113
/* True if the UTF-8 pointer of the object is the same pointer as its
   character data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same pointer as its
   character data.
   The macro only refers to its own argument: it previously expanded to
   _PyUnicode_WSTR(unicode) and therefore silently depended on the caller
   having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
121
/* True if the Unicode object owns a separately allocated UTF-8 buffer:
   the object is not compact ASCII, its utf8 pointer is set, and that
   pointer is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    ((!PyUnicode_IS_COMPACT_ASCII(op) &&                        \
      _PyUnicode_UTF8(op) &&                                    \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
128
/* True if the Unicode object owns a separately allocated wstr buffer:
   the wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    ((_PyUnicode_WSTR(op) &&                                    \
      (!PyUnicode_IS_READY(op) ||                               \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
135
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source character is converted with a plain cast to to_type.  The
   bulk of the input is copied four characters per iteration; the remaining
   0-3 characters are copied one at a time.

   The "to" argument is parenthesized before being cast so that a compound
   expression such as "buf + offset" is converted as a whole, instead of
   having the cast bind to its first operand only. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200159
Walter Dörwald16807132007-05-25 13:52:07 +0000160/* This dictionary holds all interned unicode strings. Note that references
161 to strings in this dictionary are *not* counted in the string's ob_refcnt.
162 When the interned string reaches a refcnt of 0 the string deallocation
163 function will delete the reference from this dictionary.
164
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000167*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200168static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000169
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000170/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200171static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200172
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) on first use.  If the creation fails,
   unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return the shared empty string from the calling function; the returned
   value is NULL when the singleton could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200192/* Forward declaration */
193Py_LOCAL_INLINE(int)
194_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
195
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200196/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200197static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* Single character Unicode strings in the Latin-1 range are being
200 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202
/* Fast detection of the most frequent whitespace characters.
   Table indexed by code point (0x00-0x7F): an entry is 1 if the code point
   is one of the whitespace characters listed below, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0009 CHARACTER TABULATION
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR
       0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
233
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200234/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200235static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200236static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100237static int unicode_modifiable(PyObject *unicode);
238
Victor Stinnerfe226c02011-10-03 03:52:20 +0200239
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100241_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200242static PyObject *
243_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
244static PyObject *
245_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
246
247static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000248unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000249 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100250 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000251 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
252
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253static void
254raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300255 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100256 PyObject *unicode,
257 Py_ssize_t startpos, Py_ssize_t endpos,
258 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000259
/* Same for linebreaks.
   Table indexed by code point (0x00-0x7F): an entry is 1 if the code point
   is one of the line break characters listed below, 0 otherwise.
   The table is a read-only lookup table, so it is declared const like
   _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
287
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300288/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
289 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000291PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000293#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000294 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000296 /* This is actually an illegal character, so it should
297 not be passed to unichr. */
298 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299#endif
300}
301
#ifdef Py_DEBUG
/* Check the internal invariants of a Unicode object (debug builds only).

   op must be a Unicode object.  The state flags (kind, compact, ascii,
   ready) are checked against the data, utf8 and wstr pointers and lengths
   for every layout: compact ASCII, compact, legacy not ready (wchar kind)
   and legacy ready.  If check_content is non-zero and the string is not of
   the wchar kind, the characters are also scanned to check that the
   smallest possible kind is used and that the string is null-terminated.

   Every violated invariant triggers its own assert(); the function itself
   always returns 1 so that it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        /* Every layout except compact ASCII has the utf8/wstr bookkeeping
           fields of PyCompactUnicodeObject. */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* legacy string, ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* legacy string, not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an unset cache pointer must come with a zero cached length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos = 0;

        /* find the largest code point stored in the string */
        while (pos < ascii->length) {
            Py_UCS4 cur = PyUnicode_READ(kind, data, pos);
            if (maxchar < cur)
                maxchar = cur;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character located after the last one must be null */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
417
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100418static PyObject*
419unicode_result_wchar(PyObject *unicode)
420{
421#ifndef Py_DEBUG
422 Py_ssize_t len;
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100426 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200427 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100432 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200440 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 return NULL;
442 }
443#else
Victor Stinneraa771272012-10-04 02:32:58 +0200444 assert(Py_REFCNT(unicode) == 1);
445
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 /* don't make the result ready in debug mode to ensure that the caller
447 makes the string ready before using it */
448 assert(_PyUnicode_CheckConsistency(unicode, 1));
449#endif
450 return unicode;
451}
452
453static PyObject*
454unicode_result_ready(PyObject *unicode)
455{
456 Py_ssize_t length;
457
458 length = PyUnicode_GET_LENGTH(unicode);
459 if (length == 0) {
460 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100461 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200462 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100463 }
464 return unicode_empty;
465 }
466
467 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200468 void *data = PyUnicode_DATA(unicode);
469 int kind = PyUnicode_KIND(unicode);
470 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100471 if (ch < 256) {
472 PyObject *latin1_char = unicode_latin1[ch];
473 if (latin1_char != NULL) {
474 if (unicode != latin1_char) {
475 Py_INCREF(latin1_char);
476 Py_DECREF(unicode);
477 }
478 return latin1_char;
479 }
480 else {
481 assert(_PyUnicode_CheckConsistency(unicode, 1));
482 Py_INCREF(unicode);
483 unicode_latin1[ch] = unicode;
484 return unicode;
485 }
486 }
487 }
488
489 assert(_PyUnicode_CheckConsistency(unicode, 1));
490 return unicode;
491}
492
493static PyObject*
494unicode_result(PyObject *unicode)
495{
496 assert(_PyUnicode_CHECK(unicode));
497 if (PyUnicode_IS_READY(unicode))
498 return unicode_result_ready(unicode);
499 else
500 return unicode_result_wchar(unicode);
501}
502
Victor Stinnerc4b49542011-12-11 22:44:26 +0100503static PyObject*
504unicode_result_unchanged(PyObject *unicode)
505{
506 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500507 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508 return NULL;
509 Py_INCREF(unicode);
510 return unicode;
511 }
512 else
513 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100514 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515}
516
Victor Stinner3a50e702011-10-18 21:21:00 +0200517#ifdef HAVE_MBCS
518static OSVERSIONINFOEX winver;
519#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521/* --- Bloom Filters ----------------------------------------------------- */
522
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
526
527/* the linebreak mask is set up by Unicode_Init below */
528
Antoine Pitrouf068f942010-01-13 14:19:12 +0000529#if LONG_BIT >= 128
530#define BLOOM_WIDTH 128
531#elif LONG_BIT >= 64
532#define BLOOM_WIDTH 64
533#elif LONG_BIT >= 32
534#define BLOOM_WIDTH 32
535#else
536#error "LONG_BIT is smaller than 32"
537#endif
538
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539#define BLOOM_MASK unsigned long
540
Serhiy Storchaka05997252013-01-26 12:14:02 +0200541static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542
/* Non-zero if the bit selected by the low bits of ch, that is
   ch & (BLOOM_WIDTH - 1), is set in mask.
   The mask argument is parenthesized so that an expression such as
   "a | b" is tested as a whole. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup in ascii_linebreak below 128, otherwise
   the bloom_linebreak mask is tested first and Py_UNICODE_ISLINEBREAK()
   is only called if the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
Alexander Belopolsky40018472011-02-26 01:02:56 +0000549Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551{
Victor Stinnera85af502013-04-09 21:53:54 +0200552#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
553 do { \
554 TYPE *data = (TYPE *)PTR; \
555 TYPE *end = data + LEN; \
556 Py_UCS4 ch; \
557 for (; data != end; data++) { \
558 ch = *data; \
559 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
560 } \
561 break; \
562 } while (0)
563
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564 /* calculate simple bloom-style bitmask for a given unicode string */
565
Antoine Pitrouf068f942010-01-13 14:19:12 +0000566 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567
568 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200569 switch (kind) {
570 case PyUnicode_1BYTE_KIND:
571 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
572 break;
573 case PyUnicode_2BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
575 break;
576 case PyUnicode_4BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
578 break;
579 default:
580 assert(0);
581 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000582 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200583
584#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585}
586
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200587/* Compilation of templated routines */
588
589#include "stringlib/asciilib.h"
590#include "stringlib/fastsearch.h"
591#include "stringlib/partition.h"
592#include "stringlib/split.h"
593#include "stringlib/count.h"
594#include "stringlib/find.h"
595#include "stringlib/find_max_char.h"
596#include "stringlib/localeutil.h"
597#include "stringlib/undef.h"
598
599#include "stringlib/ucs1lib.h"
600#include "stringlib/fastsearch.h"
601#include "stringlib/partition.h"
602#include "stringlib/split.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300605#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs2lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300616#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200617#include "stringlib/find_max_char.h"
618#include "stringlib/localeutil.h"
619#include "stringlib/undef.h"
620
621#include "stringlib/ucs4lib.h"
622#include "stringlib/fastsearch.h"
623#include "stringlib/partition.h"
624#include "stringlib/split.h"
625#include "stringlib/count.h"
626#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300627#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200628#include "stringlib/find_max_char.h"
629#include "stringlib/localeutil.h"
630#include "stringlib/undef.h"
631
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200632#include "stringlib/unicodedefs.h"
633#include "stringlib/fastsearch.h"
634#include "stringlib/count.h"
635#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100636#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638/* --- Unicode Object ----------------------------------------------------- */
639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200641fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200642
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200643Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
644 Py_ssize_t size, Py_UCS4 ch,
645 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
648
649 switch (kind) {
650 case PyUnicode_1BYTE_KIND:
651 {
652 Py_UCS1 ch1 = (Py_UCS1) ch;
653 if (ch1 == ch)
654 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
655 else
656 return -1;
657 }
658 case PyUnicode_2BYTE_KIND:
659 {
660 Py_UCS2 ch2 = (Py_UCS2) ch;
661 if (ch2 == ch)
662 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
663 else
664 return -1;
665 }
666 case PyUnicode_4BYTE_KIND:
667 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
668 default:
669 assert(0);
670 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672}
673
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters located after old_length are overwritten, every byte
   being set to 0xff; nothing is done if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind is used as the size of one character, in bytes */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
692
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693static PyObject*
694resize_compact(PyObject *unicode, Py_ssize_t length)
695{
696 Py_ssize_t char_size;
697 Py_ssize_t struct_size;
698 Py_ssize_t new_size;
699 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100700 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
703#endif
704
Victor Stinner79891572012-05-03 13:43:07 +0200705 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100707 assert(PyUnicode_IS_COMPACT(unicode));
708
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200709 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100710 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 struct_size = sizeof(PyASCIIObject);
712 else
713 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
717 PyErr_NoMemory();
718 return NULL;
719 }
720 new_size = (struct_size + (length + 1) * char_size);
721
Victor Stinner84def372011-12-11 20:04:56 +0100722 _Py_DEC_REFTOTAL;
723 _Py_ForgetReference(unicode);
724
725 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
726 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100727 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200728 PyErr_NoMemory();
729 return NULL;
730 }
Victor Stinner84def372011-12-11 20:04:56 +0100731 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100733
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100737 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 _PyUnicode_WSTR_LENGTH(unicode) = length;
739 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100740 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
741 PyObject_DEL(_PyUnicode_WSTR(unicode));
742 _PyUnicode_WSTR(unicode) = NULL;
743 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200744#ifdef Py_DEBUG
745 unicode_fill_invalid(unicode, old_length);
746#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
748 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 return unicode;
751}
752
Alexander Belopolsky40018472011-02-26 01:02:56 +0000753static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200754resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755{
Victor Stinner95663112011-10-04 01:03:50 +0200756 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000760
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 if (PyUnicode_IS_READY(unicode)) {
762 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200765#ifdef Py_DEBUG
766 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
767#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200770 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200771 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
772 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
775 PyErr_NoMemory();
776 return -1;
777 }
778 new_size = (length + 1) * char_size;
779
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
781 {
782 PyObject_DEL(_PyUnicode_UTF8(unicode));
783 _PyUnicode_UTF8(unicode) = NULL;
784 _PyUnicode_UTF8_LENGTH(unicode) = 0;
785 }
786
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 data = (PyObject *)PyObject_REALLOC(data, new_size);
788 if (data == NULL) {
789 PyErr_NoMemory();
790 return -1;
791 }
792 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200793 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200795 _PyUnicode_WSTR_LENGTH(unicode) = length;
796 }
797 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200798 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200799 _PyUnicode_UTF8_LENGTH(unicode) = length;
800 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200801 _PyUnicode_LENGTH(unicode) = length;
802 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200803#ifdef Py_DEBUG
804 unicode_fill_invalid(unicode, old_length);
805#endif
Victor Stinner95663112011-10-04 01:03:50 +0200806 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200807 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200809 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200810 }
Victor Stinner95663112011-10-04 01:03:50 +0200811 assert(_PyUnicode_WSTR(unicode) != NULL);
812
813 /* check for integer overflow */
814 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
815 PyErr_NoMemory();
816 return -1;
817 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100818 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200819 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100820 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200821 if (!wstr) {
822 PyErr_NoMemory();
823 return -1;
824 }
825 _PyUnicode_WSTR(unicode) = wstr;
826 _PyUnicode_WSTR(unicode)[length] = 0;
827 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200828 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 return 0;
830}
831
Victor Stinnerfe226c02011-10-03 03:52:20 +0200832static PyObject*
833resize_copy(PyObject *unicode, Py_ssize_t length)
834{
835 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100836 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100838
Benjamin Petersonbac79492012-01-14 13:34:47 -0500839 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100840 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200841
842 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
843 if (copy == NULL)
844 return NULL;
845
846 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200847 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200848 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200849 }
850 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200851 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100852
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200853 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200854 if (w == NULL)
855 return NULL;
856 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
857 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200858 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
859 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200860 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200861 }
862}
863
/* We allocate one extra character (not merely one byte) to make sure the
   string is always terminated by U+0000; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could be further enhanced by ensuring that the
       free list never shrinks below a size of 1.

*/
872
Alexander Belopolsky40018472011-02-26 01:02:56 +0000873static PyUnicodeObject *
874_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200876 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880 if (length == 0 && unicode_empty != NULL) {
881 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200882 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 }
884
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000885 /* Ensure we won't overflow the size. */
886 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
887 return (PyUnicodeObject *)PyErr_NoMemory();
888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 if (length < 0) {
890 PyErr_SetString(PyExc_SystemError,
891 "Negative size passed to _PyUnicode_New");
892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
896 if (unicode == NULL)
897 return NULL;
898 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100899
900 _PyUnicode_WSTR_LENGTH(unicode) = length;
901 _PyUnicode_HASH(unicode) = -1;
902 _PyUnicode_STATE(unicode).interned = 0;
903 _PyUnicode_STATE(unicode).kind = 0;
904 _PyUnicode_STATE(unicode).compact = 0;
905 _PyUnicode_STATE(unicode).ready = 0;
906 _PyUnicode_STATE(unicode).ascii = 0;
907 _PyUnicode_DATA_ANY(unicode) = NULL;
908 _PyUnicode_LENGTH(unicode) = 0;
909 _PyUnicode_UTF8(unicode) = NULL;
910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
913 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100914 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000915 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100916 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200918
Jeremy Hyltond8082792003-09-16 19:41:39 +0000919 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000920 * the caller fails before initializing str -- unicode_resize()
921 * reads str[0], and the Keep-Alive optimization can keep memory
922 * allocated for str alive across a call to unicode_dealloc(unicode).
923 * We don't want unicode_resize to read uninitialized memory in
924 * that case.
925 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926 _PyUnicode_WSTR(unicode)[0] = 0;
927 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100928
Victor Stinner7931d9a2011-11-04 00:22:48 +0100929 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000930 return unicode;
931}
932
Victor Stinnerf42dc442011-10-02 23:33:16 +0200933static const char*
934unicode_kind_name(PyObject *unicode)
935{
Victor Stinner42dfd712011-10-03 14:41:45 +0200936 /* don't check consistency: unicode_kind_name() is called from
937 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938 if (!PyUnicode_IS_COMPACT(unicode))
939 {
940 if (!PyUnicode_IS_READY(unicode))
941 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600942 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 {
944 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200945 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946 return "legacy ascii";
947 else
948 return "legacy latin1";
949 case PyUnicode_2BYTE_KIND:
950 return "legacy UCS2";
951 case PyUnicode_4BYTE_KIND:
952 return "legacy UCS4";
953 default:
954 return "<legacy invalid kind>";
955 }
956 }
957 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600958 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200959 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200960 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200961 return "ascii";
962 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200967 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200968 default:
969 return "<invalid compact kind>";
970 }
971}
972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
978
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
982void *_PyUnicode_data(void *unicode){
983 printf("obj %p\n", unicode);
984 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
985 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
986 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
987 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
988 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
989 return PyUnicode_DATA(unicode);
990}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200991
992void
993_PyUnicode_Dump(PyObject *op)
994{
995 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200996 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
997 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
998 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200999
Victor Stinnera849a4b2011-10-03 12:12:11 +02001000 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001001 {
1002 if (ascii->state.ascii)
1003 data = (ascii + 1);
1004 else
1005 data = (compact + 1);
1006 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001007 else
1008 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001009 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1010
Victor Stinnera849a4b2011-10-03 12:12:11 +02001011 if (ascii->wstr == data)
1012 printf("shared ");
1013 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001014
Victor Stinnera3b334d2011-10-03 13:53:37 +02001015 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 printf(" (%zu), ", compact->wstr_length);
1017 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1018 printf("shared ");
1019 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001020 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001022}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001023#endif
1024
1025PyObject *
1026PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1027{
1028 PyObject *obj;
1029 PyCompactUnicodeObject *unicode;
1030 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001031 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001032 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035
1036 /* Optimization for empty strings */
1037 if (size == 0 && unicode_empty != NULL) {
1038 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001039 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 }
1041
Victor Stinner9e9d6892011-10-04 01:02:02 +02001042 is_ascii = 0;
1043 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 struct_size = sizeof(PyCompactUnicodeObject);
1045 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001046 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 char_size = 1;
1048 is_ascii = 1;
1049 struct_size = sizeof(PyASCIIObject);
1050 }
1051 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001052 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 char_size = 1;
1054 }
1055 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001056 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 char_size = 2;
1058 if (sizeof(wchar_t) == 2)
1059 is_sharing = 1;
1060 }
1061 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001062 if (maxchar > MAX_UNICODE) {
1063 PyErr_SetString(PyExc_SystemError,
1064 "invalid maximum character passed to PyUnicode_New");
1065 return NULL;
1066 }
Victor Stinner8f825062012-04-27 13:55:39 +02001067 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 char_size = 4;
1069 if (sizeof(wchar_t) == 4)
1070 is_sharing = 1;
1071 }
1072
1073 /* Ensure we won't overflow the size. */
1074 if (size < 0) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "Negative size passed to PyUnicode_New");
1077 return NULL;
1078 }
1079 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1080 return PyErr_NoMemory();
1081
1082 /* Duplicated allocation code from _PyObject_New() instead of a call to
1083 * PyObject_New() so we are able to allocate space for the object and
1084 * it's data buffer.
1085 */
1086 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1087 if (obj == NULL)
1088 return PyErr_NoMemory();
1089 obj = PyObject_INIT(obj, &PyUnicode_Type);
1090 if (obj == NULL)
1091 return NULL;
1092
1093 unicode = (PyCompactUnicodeObject *)obj;
1094 if (is_ascii)
1095 data = ((PyASCIIObject*)obj) + 1;
1096 else
1097 data = unicode + 1;
1098 _PyUnicode_LENGTH(unicode) = size;
1099 _PyUnicode_HASH(unicode) = -1;
1100 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001101 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001102 _PyUnicode_STATE(unicode).compact = 1;
1103 _PyUnicode_STATE(unicode).ready = 1;
1104 _PyUnicode_STATE(unicode).ascii = is_ascii;
1105 if (is_ascii) {
1106 ((char*)data)[size] = 0;
1107 _PyUnicode_WSTR(unicode) = NULL;
1108 }
Victor Stinner8f825062012-04-27 13:55:39 +02001109 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 ((char*)data)[size] = 0;
1111 _PyUnicode_WSTR(unicode) = NULL;
1112 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001114 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 else {
1117 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001118 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001119 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001121 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 ((Py_UCS4*)data)[size] = 0;
1123 if (is_sharing) {
1124 _PyUnicode_WSTR_LENGTH(unicode) = size;
1125 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1126 }
1127 else {
1128 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1129 _PyUnicode_WSTR(unicode) = NULL;
1130 }
1131 }
Victor Stinner8f825062012-04-27 13:55:39 +02001132#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001133 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001134#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001135 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136 return obj;
1137}
1138
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit, including a lone surrogate, is copied as is.  The other width
   conversions are implemented as macros for efficiency.

   This function assumes that `unicode` can hold one more code point than
   the converted characters, for a terminating null character, and that its
   length equals the number of code points produced (asserted). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Surrogate pair: two input units, one output code point. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1178
Victor Stinnercd9950f2011-10-02 00:34:53 +02001179static int
Victor Stinner488fa492011-12-12 00:01:39 +01001180unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001181{
Victor Stinner488fa492011-12-12 00:01:39 +01001182 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001183 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001184 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001185 return -1;
1186 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001187 return 0;
1188}
1189
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001190static int
1191_copy_characters(PyObject *to, Py_ssize_t to_start,
1192 PyObject *from, Py_ssize_t from_start,
1193 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001195 unsigned int from_kind, to_kind;
1196 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001197
Victor Stinneree4544c2012-05-09 22:24:08 +02001198 assert(0 <= how_many);
1199 assert(0 <= from_start);
1200 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001201 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204
Victor Stinnerd3f08822012-05-29 12:57:52 +02001205 assert(PyUnicode_Check(to));
1206 assert(PyUnicode_IS_READY(to));
1207 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1208
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001209 if (how_many == 0)
1210 return 0;
1211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001215 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216
Victor Stinnerf1852262012-06-16 16:38:26 +02001217#ifdef Py_DEBUG
1218 if (!check_maxchar
1219 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1220 {
1221 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1222 Py_UCS4 ch;
1223 Py_ssize_t i;
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 assert(ch <= to_maxchar);
1227 }
1228 }
1229#endif
1230
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001231 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001232 if (check_maxchar
1233 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1234 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 /* Writing Latin-1 characters into an ASCII string requires to
1236 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 Py_UCS4 max_char;
1238 max_char = ucs1lib_find_max_char(from_data,
1239 (Py_UCS1*)from_data + how_many);
1240 if (max_char >= 128)
1241 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001243 Py_MEMCPY((char*)to_data + to_kind * to_start,
1244 (char*)from_data + from_kind * from_start,
1245 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001247 else if (from_kind == PyUnicode_1BYTE_KIND
1248 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001249 {
1250 _PyUnicode_CONVERT_BYTES(
1251 Py_UCS1, Py_UCS2,
1252 PyUnicode_1BYTE_DATA(from) + from_start,
1253 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1254 PyUnicode_2BYTE_DATA(to) + to_start
1255 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001256 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001257 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001258 && to_kind == PyUnicode_4BYTE_KIND)
1259 {
1260 _PyUnicode_CONVERT_BYTES(
1261 Py_UCS1, Py_UCS4,
1262 PyUnicode_1BYTE_DATA(from) + from_start,
1263 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1264 PyUnicode_4BYTE_DATA(to) + to_start
1265 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001266 }
1267 else if (from_kind == PyUnicode_2BYTE_KIND
1268 && to_kind == PyUnicode_4BYTE_KIND)
1269 {
1270 _PyUnicode_CONVERT_BYTES(
1271 Py_UCS2, Py_UCS4,
1272 PyUnicode_2BYTE_DATA(from) + from_start,
1273 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1274 PyUnicode_4BYTE_DATA(to) + to_start
1275 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001276 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001277 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001278 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1279
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001280 if (!check_maxchar) {
1281 if (from_kind == PyUnicode_2BYTE_KIND
1282 && to_kind == PyUnicode_1BYTE_KIND)
1283 {
1284 _PyUnicode_CONVERT_BYTES(
1285 Py_UCS2, Py_UCS1,
1286 PyUnicode_2BYTE_DATA(from) + from_start,
1287 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1288 PyUnicode_1BYTE_DATA(to) + to_start
1289 );
1290 }
1291 else if (from_kind == PyUnicode_4BYTE_KIND
1292 && to_kind == PyUnicode_1BYTE_KIND)
1293 {
1294 _PyUnicode_CONVERT_BYTES(
1295 Py_UCS4, Py_UCS1,
1296 PyUnicode_4BYTE_DATA(from) + from_start,
1297 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1298 PyUnicode_1BYTE_DATA(to) + to_start
1299 );
1300 }
1301 else if (from_kind == PyUnicode_4BYTE_KIND
1302 && to_kind == PyUnicode_2BYTE_KIND)
1303 {
1304 _PyUnicode_CONVERT_BYTES(
1305 Py_UCS4, Py_UCS2,
1306 PyUnicode_4BYTE_DATA(from) + from_start,
1307 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1308 PyUnicode_2BYTE_DATA(to) + to_start
1309 );
1310 }
1311 else {
1312 assert(0);
1313 return -1;
1314 }
1315 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001316 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001317 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001318 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001319 Py_ssize_t i;
1320
Victor Stinnera0702ab2011-09-29 14:14:38 +02001321 for (i=0; i < how_many; i++) {
1322 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001323 if (ch > to_maxchar)
1324 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001325 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1326 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001327 }
1328 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001329 return 0;
1330}
1331
Victor Stinnerd3f08822012-05-29 12:57:52 +02001332void
1333_PyUnicode_FastCopyCharacters(
1334 PyObject *to, Py_ssize_t to_start,
1335 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001336{
1337 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1338}
1339
1340Py_ssize_t
1341PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1342 PyObject *from, Py_ssize_t from_start,
1343 Py_ssize_t how_many)
1344{
1345 int err;
1346
1347 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1348 PyErr_BadInternalCall();
1349 return -1;
1350 }
1351
Benjamin Petersonbac79492012-01-14 13:34:47 -05001352 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001353 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001354 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001355 return -1;
1356
Victor Stinnerd3f08822012-05-29 12:57:52 +02001357 if (from_start < 0) {
1358 PyErr_SetString(PyExc_IndexError, "string index out of range");
1359 return -1;
1360 }
1361 if (to_start < 0) {
1362 PyErr_SetString(PyExc_IndexError, "string index out of range");
1363 return -1;
1364 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1366 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1367 PyErr_Format(PyExc_SystemError,
1368 "Cannot write %zi characters at %zi "
1369 "in a string of %zi characters",
1370 how_many, to_start, PyUnicode_GET_LENGTH(to));
1371 return -1;
1372 }
1373
1374 if (how_many == 0)
1375 return 0;
1376
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001378 return -1;
1379
1380 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1381 if (err) {
1382 PyErr_Format(PyExc_SystemError,
1383 "Cannot copy %s characters "
1384 "into a string of %s characters",
1385 unicode_kind_name(from),
1386 unicode_kind_name(to));
1387 return -1;
1388 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390}
1391
Victor Stinner17222162011-09-28 22:15:37 +02001392/* Find the maximum code point and count the number of surrogate pairs so a
1393 correct string length can be computed before converting a string to UCS4.
1394 This function counts single surrogates as a character and not as a pair.
1395
1396 Return 0 on success, or -1 on error. */
1397static int
1398find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1399 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400{
1401 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001402 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403
Victor Stinnerc53be962011-10-02 21:33:54 +02001404 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 *num_surrogates = 0;
1406 *maxchar = 0;
1407
1408 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001410 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1411 && (iter+1) < end
1412 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1413 {
1414 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1415 ++(*num_surrogates);
1416 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 }
1418 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001420 {
1421 ch = *iter;
1422 iter++;
1423 }
1424 if (ch > *maxchar) {
1425 *maxchar = ch;
1426 if (*maxchar > MAX_UNICODE) {
1427 PyErr_Format(PyExc_ValueError,
1428 "character U+%x is not in range [U+0000; U+10ffff]",
1429 ch);
1430 return -1;
1431 }
1432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 return 0;
1435}
1436
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001437int
1438_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439{
1440 wchar_t *end;
1441 Py_UCS4 maxchar = 0;
1442 Py_ssize_t num_surrogates;
1443#if SIZEOF_WCHAR_T == 2
1444 Py_ssize_t length_wo_surrogates;
1445#endif
1446
Georg Brandl7597add2011-10-05 16:36:47 +02001447 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001448 strings were created using _PyObject_New() and where no canonical
1449 representation (the str field) has been set yet aka strings
1450 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001451 assert(_PyUnicode_CHECK(unicode));
1452 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001454 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001455 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001456 /* Actually, it should neither be interned nor be anything else: */
1457 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001460 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
1464 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001465 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1466 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 PyErr_NoMemory();
1468 return -1;
1469 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001470 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 _PyUnicode_WSTR(unicode), end,
1472 PyUnicode_1BYTE_DATA(unicode));
1473 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1474 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1475 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1476 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001477 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001478 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001479 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 }
1481 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001483 _PyUnicode_UTF8(unicode) = NULL;
1484 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 PyObject_FREE(_PyUnicode_WSTR(unicode));
1487 _PyUnicode_WSTR(unicode) = NULL;
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 }
1490 /* In this case we might have to convert down from 4-byte native
1491 wchar_t to 2-byte unicode. */
1492 else if (maxchar < 65536) {
1493 assert(num_surrogates == 0 &&
1494 "FindMaxCharAndNumSurrogatePairs() messed up");
1495
Victor Stinner506f5922011-09-28 22:34:18 +02001496#if SIZEOF_WCHAR_T == 2
1497 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001498 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001499 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1500 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1501 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001502 _PyUnicode_UTF8(unicode) = NULL;
1503 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001504#else
1505 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001507 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001508 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001509 PyErr_NoMemory();
1510 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 }
Victor Stinner506f5922011-09-28 22:34:18 +02001512 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1513 _PyUnicode_WSTR(unicode), end,
1514 PyUnicode_2BYTE_DATA(unicode));
1515 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1516 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1517 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001518 _PyUnicode_UTF8(unicode) = NULL;
1519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001520 PyObject_FREE(_PyUnicode_WSTR(unicode));
1521 _PyUnicode_WSTR(unicode) = NULL;
1522 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1523#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524 }
1525 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1526 else {
1527#if SIZEOF_WCHAR_T == 2
1528 /* in case the native representation is 2-bytes, we need to allocate a
1529 new normalized 4-byte version. */
1530 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001531 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1532 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 PyErr_NoMemory();
1534 return -1;
1535 }
1536 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1537 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001538 _PyUnicode_UTF8(unicode) = NULL;
1539 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001540 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1541 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001542 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543 PyObject_FREE(_PyUnicode_WSTR(unicode));
1544 _PyUnicode_WSTR(unicode) = NULL;
1545 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1546#else
1547 assert(num_surrogates == 0);
1548
Victor Stinnerc3c74152011-10-02 20:39:55 +02001549 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001551 _PyUnicode_UTF8(unicode) = NULL;
1552 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1554#endif
1555 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1556 }
1557 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001558 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 return 0;
1560}
1561
Alexander Belopolsky40018472011-02-26 01:02:56 +00001562static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001563unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564{
Walter Dörwald16807132007-05-25 13:52:07 +00001565 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001566 case SSTATE_NOT_INTERNED:
1567 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001568
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 case SSTATE_INTERNED_MORTAL:
1570 /* revive dead object temporarily for DelItem */
1571 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001572 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 Py_FatalError(
1574 "deletion of interned string failed");
1575 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001576
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 case SSTATE_INTERNED_IMMORTAL:
1578 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001579
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 default:
1581 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001582 }
1583
Victor Stinner03490912011-10-03 23:45:12 +02001584 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001586 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001587 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001588 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1589 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001591 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592}
1593
#ifdef Py_DEBUG
/* Debug-build helper: return 1 if `unicode` is one of the shared singletons
   visible here -- the empty string, or the cached one-character string
   stored in unicode_latin1[] for its code point -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only a canonical (non-wstr) string of length 1 can be a cached char. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1610
Alexander Belopolsky40018472011-02-26 01:02:56 +00001611static int
Victor Stinner488fa492011-12-12 00:01:39 +01001612unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001613{
Victor Stinner488fa492011-12-12 00:01:39 +01001614 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 if (Py_REFCNT(unicode) != 1)
1616 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001617 if (_PyUnicode_HASH(unicode) != -1)
1618 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001619 if (PyUnicode_CHECK_INTERNED(unicode))
1620 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001621 if (!PyUnicode_CheckExact(unicode))
1622 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001623#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001624 /* singleton refcount is greater than 1 */
1625 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001626#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001627 return 1;
1628}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629
Victor Stinnerfe226c02011-10-03 03:52:20 +02001630static int
1631unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1632{
1633 PyObject *unicode;
1634 Py_ssize_t old_length;
1635
1636 assert(p_unicode != NULL);
1637 unicode = *p_unicode;
1638
1639 assert(unicode != NULL);
1640 assert(PyUnicode_Check(unicode));
1641 assert(0 <= length);
1642
Victor Stinner910337b2011-10-03 03:20:16 +02001643 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001644 old_length = PyUnicode_WSTR_LENGTH(unicode);
1645 else
1646 old_length = PyUnicode_GET_LENGTH(unicode);
1647 if (old_length == length)
1648 return 0;
1649
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001650 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001651 _Py_INCREF_UNICODE_EMPTY();
1652 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001653 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 Py_DECREF(*p_unicode);
1655 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001656 return 0;
1657 }
1658
Victor Stinner488fa492011-12-12 00:01:39 +01001659 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001660 PyObject *copy = resize_copy(unicode, length);
1661 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001663 Py_DECREF(*p_unicode);
1664 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001665 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666 }
1667
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001669 PyObject *new_unicode = resize_compact(unicode, length);
1670 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001671 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001672 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001674 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001675 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001676}
1677
Alexander Belopolsky40018472011-02-26 01:02:56 +00001678int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001679PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001680{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001681 PyObject *unicode;
1682 if (p_unicode == NULL) {
1683 PyErr_BadInternalCall();
1684 return -1;
1685 }
1686 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001687 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688 {
1689 PyErr_BadInternalCall();
1690 return -1;
1691 }
1692 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001693}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001694
Victor Stinnerc5166102012-02-22 13:55:02 +01001695/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001696
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001697 WARNING: The function doesn't copy the terminating null character and
1698 doesn't check the maximum character (may write a latin1 character in an
1699 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001700static void
1701unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1702 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001703{
1704 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1705 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001706 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001707
1708 switch (kind) {
1709 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001710 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001711#ifdef Py_DEBUG
1712 if (PyUnicode_IS_ASCII(unicode)) {
1713 Py_UCS4 maxchar = ucs1lib_find_max_char(
1714 (const Py_UCS1*)str,
1715 (const Py_UCS1*)str + len);
1716 assert(maxchar < 128);
1717 }
1718#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001719 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001720 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001721 }
1722 case PyUnicode_2BYTE_KIND: {
1723 Py_UCS2 *start = (Py_UCS2 *)data + index;
1724 Py_UCS2 *ucs2 = start;
1725 assert(index <= PyUnicode_GET_LENGTH(unicode));
1726
Victor Stinner184252a2012-06-16 02:57:41 +02001727 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 *ucs2 = (Py_UCS2)*str;
1729
1730 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001731 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 }
1733 default: {
1734 Py_UCS4 *start = (Py_UCS4 *)data + index;
1735 Py_UCS4 *ucs4 = start;
1736 assert(kind == PyUnicode_4BYTE_KIND);
1737 assert(index <= PyUnicode_GET_LENGTH(unicode));
1738
Victor Stinner184252a2012-06-16 02:57:41 +02001739 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001740 *ucs4 = (Py_UCS4)*str;
1741
1742 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001743 }
1744 }
1745}
1746
1747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748static PyObject*
1749get_latin1_char(unsigned char ch)
1750{
Victor Stinnera464fc12011-10-02 20:39:30 +02001751 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001753 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 if (!unicode)
1755 return NULL;
1756 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001757 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 unicode_latin1[ch] = unicode;
1759 }
1760 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001761 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Alexander Belopolsky40018472011-02-26 01:02:56 +00001764PyObject *
1765PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001767 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 Py_UCS4 maxchar = 0;
1769 Py_ssize_t num_surrogates;
1770
1771 if (u == NULL)
1772 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001774 /* If the Unicode data is known at construction time, we can apply
1775 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001778 if (size == 0)
1779 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 /* Single character Unicode objects in the Latin-1 range are
1782 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001783 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 return get_latin1_char((unsigned char)*u);
1785
1786 /* If not empty and not single character, copy the Unicode data
1787 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001788 if (find_maxchar_surrogates(u, u + size,
1789 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 return NULL;
1791
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 if (!unicode)
1794 return NULL;
1795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 switch (PyUnicode_KIND(unicode)) {
1797 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001798 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1800 break;
1801 case PyUnicode_2BYTE_KIND:
1802#if Py_UNICODE_SIZE == 2
1803 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1804#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001805 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1807#endif
1808 break;
1809 case PyUnicode_4BYTE_KIND:
1810#if SIZEOF_WCHAR_T == 2
1811 /* This is the only case which has to process surrogates, thus
1812 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001813 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814#else
1815 assert(num_surrogates == 0);
1816 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1817#endif
1818 break;
1819 default:
1820 assert(0 && "Impossible state");
1821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001823 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824}
1825
Alexander Belopolsky40018472011-02-26 01:02:56 +00001826PyObject *
1827PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001828{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001829 if (size < 0) {
1830 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001831 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 return NULL;
1833 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001834 if (u != NULL)
1835 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1836 else
1837 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001838}
1839
Alexander Belopolsky40018472011-02-26 01:02:56 +00001840PyObject *
1841PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001842{
1843 size_t size = strlen(u);
1844 if (size > PY_SSIZE_T_MAX) {
1845 PyErr_SetString(PyExc_OverflowError, "input too long");
1846 return NULL;
1847 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001848 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001849}
1850
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001851PyObject *
1852_PyUnicode_FromId(_Py_Identifier *id)
1853{
1854 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001855 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1856 strlen(id->string),
1857 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001858 if (!id->object)
1859 return NULL;
1860 PyUnicode_InternInPlace(&id->object);
1861 assert(!id->next);
1862 id->next = static_strings;
1863 static_strings = id;
1864 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001865 return id->object;
1866}
1867
1868void
1869_PyUnicode_ClearStaticStrings()
1870{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001871 _Py_Identifier *tmp, *s = static_strings;
1872 while (s) {
1873 Py_DECREF(s->object);
1874 s->object = NULL;
1875 tmp = s->next;
1876 s->next = NULL;
1877 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001878 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001879 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001880}
1881
Benjamin Peterson0df54292012-03-26 14:50:32 -04001882/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001883
Victor Stinnerd3f08822012-05-29 12:57:52 +02001884PyObject*
1885_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001886{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001887 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001888 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001889 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001890#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001891 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001892#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001893 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001894 }
Victor Stinner785938e2011-12-11 20:09:03 +01001895 unicode = PyUnicode_New(size, 127);
1896 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001897 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001898 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1899 assert(_PyUnicode_CheckConsistency(unicode, 1));
1900 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001901}
1902
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001903static Py_UCS4
1904kind_maxchar_limit(unsigned int kind)
1905{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001906 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907 case PyUnicode_1BYTE_KIND:
1908 return 0x80;
1909 case PyUnicode_2BYTE_KIND:
1910 return 0x100;
1911 case PyUnicode_4BYTE_KIND:
1912 return 0x10000;
1913 default:
1914 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001915 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001916 }
1917}
1918
Victor Stinnere6abb482012-05-02 01:15:40 +02001919Py_LOCAL_INLINE(Py_UCS4)
1920align_maxchar(Py_UCS4 maxchar)
1921{
1922 if (maxchar <= 127)
1923 return 127;
1924 else if (maxchar <= 255)
1925 return 255;
1926 else if (maxchar <= 65535)
1927 return 65535;
1928 else
1929 return MAX_UNICODE;
1930}
1931
Victor Stinner702c7342011-10-05 13:50:52 +02001932static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001933_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001937
Serhiy Storchaka678db842013-01-26 12:16:36 +02001938 if (size == 0)
1939 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001940 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001941 if (size == 1)
1942 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001943
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001945 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 if (!res)
1947 return NULL;
1948 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001949 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001951}
1952
Victor Stinnere57b1c02011-09-28 22:20:48 +02001953static PyObject*
1954_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955{
1956 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001957 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001958
Serhiy Storchaka678db842013-01-26 12:16:36 +02001959 if (size == 0)
1960 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001961 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001962 if (size == 1) {
1963 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001964 int kind;
1965 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001966 if (ch < 256)
1967 return get_latin1_char((unsigned char)ch);
1968
1969 res = PyUnicode_New(1, ch);
1970 if (res == NULL)
1971 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001972 kind = PyUnicode_KIND(res);
1973 data = PyUnicode_DATA(res);
1974 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001975 assert(_PyUnicode_CheckConsistency(res, 1));
1976 return res;
1977 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001979 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 if (!res)
1982 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001983 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001985 else {
1986 _PyUnicode_CONVERT_BYTES(
1987 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1988 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001989 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 return res;
1991}
1992
Victor Stinnere57b1c02011-09-28 22:20:48 +02001993static PyObject*
1994_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995{
1996 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998
Serhiy Storchaka678db842013-01-26 12:16:36 +02001999 if (size == 0)
2000 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002002 if (size == 1) {
2003 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002004 int kind;
2005 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002006 if (ch < 256)
2007 return get_latin1_char((unsigned char)ch);
2008
2009 res = PyUnicode_New(1, ch);
2010 if (res == NULL)
2011 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002012 kind = PyUnicode_KIND(res);
2013 data = PyUnicode_DATA(res);
2014 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002015 assert(_PyUnicode_CheckConsistency(res, 1));
2016 return res;
2017 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002019 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002020 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 if (!res)
2022 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002023 if (max_char < 256)
2024 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2025 PyUnicode_1BYTE_DATA(res));
2026 else if (max_char < 0x10000)
2027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2028 PyUnicode_2BYTE_DATA(res));
2029 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002031 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 return res;
2033}
2034
2035PyObject*
2036PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2037{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002038 if (size < 0) {
2039 PyErr_SetString(PyExc_ValueError, "size must be positive");
2040 return NULL;
2041 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002042 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002044 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002046 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002048 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002049 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002050 PyErr_SetString(PyExc_SystemError, "invalid kind");
2051 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053}
2054
Victor Stinnerece58de2012-04-23 23:36:38 +02002055Py_UCS4
2056_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2057{
2058 enum PyUnicode_Kind kind;
2059 void *startptr, *endptr;
2060
2061 assert(PyUnicode_IS_READY(unicode));
2062 assert(0 <= start);
2063 assert(end <= PyUnicode_GET_LENGTH(unicode));
2064 assert(start <= end);
2065
2066 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2067 return PyUnicode_MAX_CHAR_VALUE(unicode);
2068
2069 if (start == end)
2070 return 127;
2071
Victor Stinner94d558b2012-04-27 22:26:58 +02002072 if (PyUnicode_IS_ASCII(unicode))
2073 return 127;
2074
Victor Stinnerece58de2012-04-23 23:36:38 +02002075 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002076 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002077 endptr = (char *)startptr + end * kind;
2078 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002079 switch(kind) {
2080 case PyUnicode_1BYTE_KIND:
2081 return ucs1lib_find_max_char(startptr, endptr);
2082 case PyUnicode_2BYTE_KIND:
2083 return ucs2lib_find_max_char(startptr, endptr);
2084 case PyUnicode_4BYTE_KIND:
2085 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002086 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002087 assert(0);
2088 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 }
2090}
2091
Victor Stinner25a4b292011-10-06 12:31:55 +02002092/* Ensure that a string uses the most efficient storage, if it is not the
2093 case: create a new string with of the right kind. Write NULL into *p_unicode
2094 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002095static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002096unicode_adjust_maxchar(PyObject **p_unicode)
2097{
2098 PyObject *unicode, *copy;
2099 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002100 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002101 unsigned int kind;
2102
2103 assert(p_unicode != NULL);
2104 unicode = *p_unicode;
2105 assert(PyUnicode_IS_READY(unicode));
2106 if (PyUnicode_IS_ASCII(unicode))
2107 return;
2108
2109 len = PyUnicode_GET_LENGTH(unicode);
2110 kind = PyUnicode_KIND(unicode);
2111 if (kind == PyUnicode_1BYTE_KIND) {
2112 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002113 max_char = ucs1lib_find_max_char(u, u + len);
2114 if (max_char >= 128)
2115 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002116 }
2117 else if (kind == PyUnicode_2BYTE_KIND) {
2118 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002119 max_char = ucs2lib_find_max_char(u, u + len);
2120 if (max_char >= 256)
2121 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 }
2123 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002124 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002125 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002126 max_char = ucs4lib_find_max_char(u, u + len);
2127 if (max_char >= 0x10000)
2128 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002131 if (copy != NULL)
2132 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 Py_DECREF(unicode);
2134 *p_unicode = copy;
2135}
2136
Victor Stinner034f6cf2011-09-30 02:26:44 +02002137PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002138_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002139{
Victor Stinner87af4f22011-11-21 23:03:47 +01002140 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002141 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002142
Victor Stinner034f6cf2011-09-30 02:26:44 +02002143 if (!PyUnicode_Check(unicode)) {
2144 PyErr_BadInternalCall();
2145 return NULL;
2146 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002147 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002148 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149
Victor Stinner87af4f22011-11-21 23:03:47 +01002150 length = PyUnicode_GET_LENGTH(unicode);
2151 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152 if (!copy)
2153 return NULL;
2154 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2155
Victor Stinner87af4f22011-11-21 23:03:47 +01002156 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2157 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002158 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002159 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002160}
2161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163/* Widen Unicode objects to larger buffers. Don't write terminating null
2164 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165
2166void*
2167_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2168{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002169 Py_ssize_t len;
2170 void *result;
2171 unsigned int skind;
2172
Benjamin Petersonbac79492012-01-14 13:34:47 -05002173 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 return NULL;
2175
2176 len = PyUnicode_GET_LENGTH(s);
2177 skind = PyUnicode_KIND(s);
2178 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002179 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return NULL;
2181 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002182 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 case PyUnicode_2BYTE_KIND:
2184 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2185 if (!result)
2186 return PyErr_NoMemory();
2187 assert(skind == PyUnicode_1BYTE_KIND);
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS1, Py_UCS2,
2190 PyUnicode_1BYTE_DATA(s),
2191 PyUnicode_1BYTE_DATA(s) + len,
2192 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002194 case PyUnicode_4BYTE_KIND:
2195 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2196 if (!result)
2197 return PyErr_NoMemory();
2198 if (skind == PyUnicode_2BYTE_KIND) {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS4,
2201 PyUnicode_2BYTE_DATA(s),
2202 PyUnicode_2BYTE_DATA(s) + len,
2203 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002205 else {
2206 assert(skind == PyUnicode_1BYTE_KIND);
2207 _PyUnicode_CONVERT_BYTES(
2208 Py_UCS1, Py_UCS4,
2209 PyUnicode_1BYTE_DATA(s),
2210 PyUnicode_1BYTE_DATA(s) + len,
2211 result);
2212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002214 default:
2215 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 }
Victor Stinner01698042011-10-04 00:04:26 +02002217 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return NULL;
2219}
2220
2221static Py_UCS4*
2222as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2223 int copy_null)
2224{
2225 int kind;
2226 void *data;
2227 Py_ssize_t len, targetlen;
2228 if (PyUnicode_READY(string) == -1)
2229 return NULL;
2230 kind = PyUnicode_KIND(string);
2231 data = PyUnicode_DATA(string);
2232 len = PyUnicode_GET_LENGTH(string);
2233 targetlen = len;
2234 if (copy_null)
2235 targetlen++;
2236 if (!target) {
2237 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2238 PyErr_NoMemory();
2239 return NULL;
2240 }
2241 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2242 if (!target) {
2243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 }
2247 else {
2248 if (targetsize < targetlen) {
2249 PyErr_Format(PyExc_SystemError,
2250 "string is longer than the buffer");
2251 if (copy_null && 0 < targetsize)
2252 target[0] = 0;
2253 return NULL;
2254 }
2255 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 if (kind == PyUnicode_1BYTE_KIND) {
2257 Py_UCS1 *start = (Py_UCS1 *) data;
2258 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 else if (kind == PyUnicode_2BYTE_KIND) {
2261 Py_UCS2 *start = (Py_UCS2 *) data;
2262 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2263 }
2264 else {
2265 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 if (copy_null)
2269 target[len] = 0;
2270 return target;
2271}
2272
2273Py_UCS4*
2274PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2275 int copy_null)
2276{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002277 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 PyErr_BadInternalCall();
2279 return NULL;
2280 }
2281 return as_ucs4(string, target, targetsize, copy_null);
2282}
2283
2284Py_UCS4*
2285PyUnicode_AsUCS4Copy(PyObject *string)
2286{
2287 return as_ucs4(string, NULL, 0, 1);
2288}
2289
2290#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002291
Alexander Belopolsky40018472011-02-26 01:02:56 +00002292PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002293PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002297 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 PyErr_BadInternalCall();
2299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 }
2301
Martin v. Löwis790465f2008-04-05 20:41:37 +00002302 if (size == -1) {
2303 size = wcslen(w);
2304 }
2305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307}
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002310
Walter Dörwald346737f2007-05-31 10:44:43 +00002311static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002312makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002313 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002314{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002315 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002316 if (longflag)
2317 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002318 else if (longlongflag) {
2319 /* longlongflag should only ever be nonzero on machines with
2320 HAVE_LONG_LONG defined */
2321#ifdef HAVE_LONG_LONG
2322 char *f = PY_FORMAT_LONG_LONG;
2323 while (*f)
2324 *fmt++ = *f++;
2325#else
2326 /* we shouldn't ever get here */
2327 assert(0);
2328 *fmt++ = 'l';
2329#endif
2330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002331 else if (size_tflag) {
2332 char *f = PY_FORMAT_SIZE_T;
2333 while (*f)
2334 *fmt++ = *f++;
2335 }
2336 *fmt++ = c;
2337 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002338}
2339
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).
   This is the size of the sprintf() scratch buffers ("buffer" and
   "number") in unicode_fromformat_arg(). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002344
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002345static int
2346unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2347 Py_ssize_t width, Py_ssize_t precision)
2348{
2349 Py_ssize_t length, fill, arglen;
2350 Py_UCS4 maxchar;
2351
2352 if (PyUnicode_READY(str) == -1)
2353 return -1;
2354
2355 length = PyUnicode_GET_LENGTH(str);
2356 if ((precision == -1 || precision >= length)
2357 && width <= length)
2358 return _PyUnicodeWriter_WriteStr(writer, str);
2359
2360 if (precision != -1)
2361 length = Py_MIN(precision, length);
2362
2363 arglen = Py_MAX(length, width);
2364 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2365 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2366 else
2367 maxchar = writer->maxchar;
2368
2369 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2370 return -1;
2371
2372 if (width > length) {
2373 fill = width - length;
2374 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2375 return -1;
2376 writer->pos += fill;
2377 }
2378
2379 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2380 str, 0, length);
2381 writer->pos += length;
2382 return 0;
2383}
2384
2385static int
2386unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2387 Py_ssize_t width, Py_ssize_t precision)
2388{
2389 /* UTF-8 */
2390 Py_ssize_t length;
2391 PyObject *unicode;
2392 int res;
2393
2394 length = strlen(str);
2395 if (precision != -1)
2396 length = Py_MIN(length, precision);
2397 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2398 if (unicode == NULL)
2399 return -1;
2400
2401 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2402 Py_DECREF(unicode);
2403 return res;
2404}
2405
Victor Stinner96865452011-03-01 23:44:09 +00002406static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002407unicode_fromformat_arg(_PyUnicodeWriter *writer,
2408 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002409{
Victor Stinnere215d962012-10-06 23:03:36 +02002410 const char *p;
2411 Py_ssize_t len;
2412 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 Py_ssize_t width;
2414 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002415 int longflag;
2416 int longlongflag;
2417 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002419
2420 p = f;
2421 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002422 zeropad = 0;
2423 if (*f == '0') {
2424 zeropad = 1;
2425 f++;
2426 }
Victor Stinner96865452011-03-01 23:44:09 +00002427
2428 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002429 width = -1;
2430 if (Py_ISDIGIT((unsigned)*f)) {
2431 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002432 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002433 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002434 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002435 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002436 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002437 return NULL;
2438 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002440 f++;
2441 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002442 }
2443 precision = -1;
2444 if (*f == '.') {
2445 f++;
2446 if (Py_ISDIGIT((unsigned)*f)) {
2447 precision = (*f - '0');
2448 f++;
2449 while (Py_ISDIGIT((unsigned)*f)) {
2450 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2451 PyErr_SetString(PyExc_ValueError,
2452 "precision too big");
2453 return NULL;
2454 }
2455 precision = (precision * 10) + (*f - '0');
2456 f++;
2457 }
2458 }
Victor Stinner96865452011-03-01 23:44:09 +00002459 if (*f == '%') {
2460 /* "%.3%s" => f points to "3" */
2461 f--;
2462 }
2463 }
2464 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002465 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002466 f--;
2467 }
Victor Stinner96865452011-03-01 23:44:09 +00002468
2469 /* Handle %ld, %lu, %lld and %llu. */
2470 longflag = 0;
2471 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002472 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002473 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002474 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002475 longflag = 1;
2476 ++f;
2477 }
2478#ifdef HAVE_LONG_LONG
2479 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002480 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002481 longlongflag = 1;
2482 f += 2;
2483 }
2484#endif
2485 }
2486 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002487 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002488 size_tflag = 1;
2489 ++f;
2490 }
Victor Stinnere215d962012-10-06 23:03:36 +02002491
2492 if (f[1] == '\0')
2493 writer->overallocate = 0;
2494
2495 switch (*f) {
2496 case 'c':
2497 {
2498 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002499 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002500 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002501 "character argument not in range(0x110000)");
2502 return NULL;
2503 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002504 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002505 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002506 break;
2507 }
2508
2509 case 'i':
2510 case 'd':
2511 case 'u':
2512 case 'x':
2513 {
2514 /* used by sprintf */
2515 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002516 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002517 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002518
2519 if (*f == 'u') {
2520 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2521
2522 if (longflag)
2523 len = sprintf(buffer, fmt,
2524 va_arg(*vargs, unsigned long));
2525#ifdef HAVE_LONG_LONG
2526 else if (longlongflag)
2527 len = sprintf(buffer, fmt,
2528 va_arg(*vargs, unsigned PY_LONG_LONG));
2529#endif
2530 else if (size_tflag)
2531 len = sprintf(buffer, fmt,
2532 va_arg(*vargs, size_t));
2533 else
2534 len = sprintf(buffer, fmt,
2535 va_arg(*vargs, unsigned int));
2536 }
2537 else if (*f == 'x') {
2538 makefmt(fmt, 0, 0, 0, 'x');
2539 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2540 }
2541 else {
2542 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2543
2544 if (longflag)
2545 len = sprintf(buffer, fmt,
2546 va_arg(*vargs, long));
2547#ifdef HAVE_LONG_LONG
2548 else if (longlongflag)
2549 len = sprintf(buffer, fmt,
2550 va_arg(*vargs, PY_LONG_LONG));
2551#endif
2552 else if (size_tflag)
2553 len = sprintf(buffer, fmt,
2554 va_arg(*vargs, Py_ssize_t));
2555 else
2556 len = sprintf(buffer, fmt,
2557 va_arg(*vargs, int));
2558 }
2559 assert(len >= 0);
2560
Victor Stinnere215d962012-10-06 23:03:36 +02002561 if (precision < len)
2562 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002563
2564 arglen = Py_MAX(precision, width);
2565 assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127);
2566 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2567 return NULL;
2568
Victor Stinnere215d962012-10-06 23:03:36 +02002569 if (width > precision) {
2570 Py_UCS4 fillchar;
2571 fill = width - precision;
2572 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002573 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2574 return NULL;
2575 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002576 }
Victor Stinner15a11362012-10-06 23:48:20 +02002577 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002578 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002579 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2580 return NULL;
2581 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002582 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583
2584 unicode_write_cstr(writer->buffer, writer->pos, buffer, len);
2585 writer->pos += len;
Victor Stinnere215d962012-10-06 23:03:36 +02002586 break;
2587 }
2588
2589 case 'p':
2590 {
2591 char number[MAX_LONG_LONG_CHARS];
2592
2593 len = sprintf(number, "%p", va_arg(*vargs, void*));
2594 assert(len >= 0);
2595
2596 /* %p is ill-defined: ensure leading 0x. */
2597 if (number[1] == 'X')
2598 number[1] = 'x';
2599 else if (number[1] != 'x') {
2600 memmove(number + 2, number,
2601 strlen(number) + 1);
2602 number[0] = '0';
2603 number[1] = 'x';
2604 len += 2;
2605 }
2606
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002607 assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127);
2608 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002609 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002610 unicode_write_cstr(writer->buffer, writer->pos, number, len);
2611 writer->pos += len;
Victor Stinnere215d962012-10-06 23:03:36 +02002612 break;
2613 }
2614
2615 case 's':
2616 {
2617 /* UTF-8 */
2618 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002620 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002621 break;
2622 }
2623
2624 case 'U':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 assert(obj && _PyUnicode_CHECK(obj));
2628
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002630 return NULL;
2631 break;
2632 }
2633
2634 case 'V':
2635 {
2636 PyObject *obj = va_arg(*vargs, PyObject *);
2637 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002638 if (obj) {
2639 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002640 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002641 return NULL;
2642 }
2643 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 assert(str != NULL);
2645 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002646 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002647 }
2648 break;
2649 }
2650
2651 case 'S':
2652 {
2653 PyObject *obj = va_arg(*vargs, PyObject *);
2654 PyObject *str;
2655 assert(obj);
2656 str = PyObject_Str(obj);
2657 if (!str)
2658 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002659 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002660 Py_DECREF(str);
2661 return NULL;
2662 }
2663 Py_DECREF(str);
2664 break;
2665 }
2666
2667 case 'R':
2668 {
2669 PyObject *obj = va_arg(*vargs, PyObject *);
2670 PyObject *repr;
2671 assert(obj);
2672 repr = PyObject_Repr(obj);
2673 if (!repr)
2674 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002676 Py_DECREF(repr);
2677 return NULL;
2678 }
2679 Py_DECREF(repr);
2680 break;
2681 }
2682
2683 case 'A':
2684 {
2685 PyObject *obj = va_arg(*vargs, PyObject *);
2686 PyObject *ascii;
2687 assert(obj);
2688 ascii = PyObject_ASCII(obj);
2689 if (!ascii)
2690 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002692 Py_DECREF(ascii);
2693 return NULL;
2694 }
2695 Py_DECREF(ascii);
2696 break;
2697 }
2698
2699 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002700 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002701 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002702 break;
2703
2704 default:
2705 /* if we stumble upon an unknown formatting code, copy the rest
2706 of the format string to the output string. (we cannot just
2707 skip the code, since there's no way to know what's in the
2708 argument list) */
2709 len = strlen(p);
2710 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2711 return NULL;
2712 f = p+len;
2713 return f;
2714 }
2715
2716 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002717 return f;
2718}
2719
Walter Dörwaldd2034312007-05-18 16:29:38 +00002720PyObject *
2721PyUnicode_FromFormatV(const char *format, va_list vargs)
2722{
Victor Stinnere215d962012-10-06 23:03:36 +02002723 va_list vargs2;
2724 const char *f;
2725 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002726
Victor Stinner8f674cc2013-04-17 23:02:17 +02002727 _PyUnicodeWriter_Init(&writer);
2728 writer.min_length = strlen(format) + 100;
2729 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002730
2731 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2732 Copy it to be able to pass a reference to a subfunction. */
2733 Py_VA_COPY(vargs2, vargs);
2734
2735 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002737 f = unicode_fromformat_arg(&writer, f, &vargs2);
2738 if (f == NULL)
2739 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 const char *p;
2743 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002744
Victor Stinnere215d962012-10-06 23:03:36 +02002745 p = f;
2746 do
2747 {
2748 if ((unsigned char)*p > 127) {
2749 PyErr_Format(PyExc_ValueError,
2750 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2751 "string, got a non-ASCII byte: 0x%02x",
2752 (unsigned char)*p);
2753 return NULL;
2754 }
2755 p++;
2756 }
2757 while (*p != '\0' && *p != '%');
2758 len = p - f;
2759
2760 if (*p == '\0')
2761 writer.overallocate = 0;
2762 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2763 goto fail;
2764 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2765 writer.pos += len;
2766
2767 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 }
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return _PyUnicodeWriter_Finish(&writer);
2771
2772 fail:
2773 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775}
2776
Walter Dörwaldd2034312007-05-18 16:29:38 +00002777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002780 PyObject* ret;
2781 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782
2783#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 ret = PyUnicode_FromFormatV(format, vargs);
2789 va_end(vargs);
2790 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002791}
2792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793#ifdef HAVE_WCHAR_H
2794
Victor Stinner5593d8a2010-10-02 11:11:27 +00002795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796 convert a Unicode object to a wide character string.
2797
Victor Stinnerd88d9832011-09-06 02:00:05 +02002798 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002799 character) required to convert the unicode object. Ignore size argument.
2800
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002803 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002804static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002806 wchar_t *w,
2807 Py_ssize_t size)
2808{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 const wchar_t *wstr;
2811
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (wstr == NULL)
2814 return -1;
2815
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002817 if (size > res)
2818 size = res + 1;
2819 else
2820 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002822 return res;
2823 }
2824 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002826}
2827
2828Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002829PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 wchar_t *w,
2831 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832{
2833 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 PyErr_BadInternalCall();
2835 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002837 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838}
2839
Victor Stinner137c34c2010-09-29 10:25:54 +00002840wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002841PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002842 Py_ssize_t *size)
2843{
2844 wchar_t* buffer;
2845 Py_ssize_t buflen;
2846
2847 if (unicode == NULL) {
2848 PyErr_BadInternalCall();
2849 return NULL;
2850 }
2851
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002852 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002853 if (buflen == -1)
2854 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002855 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002856 PyErr_NoMemory();
2857 return NULL;
2858 }
2859
Victor Stinner137c34c2010-09-29 10:25:54 +00002860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2861 if (buffer == NULL) {
2862 PyErr_NoMemory();
2863 return NULL;
2864 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002865 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002866 if (buflen == -1) {
2867 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002869 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 if (size != NULL)
2871 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002872 return buffer;
2873}
2874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002875#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Alexander Belopolsky40018472011-02-26 01:02:56 +00002877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002881 void *data;
2882 int kind;
2883
Victor Stinner8faf8212011-12-08 22:14:11 +01002884 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 PyErr_SetString(PyExc_ValueError,
2886 "chr() arg not in range(0x110000)");
2887 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002888 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002889
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002890 if ((Py_UCS4)ordinal < 256)
2891 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 v = PyUnicode_New(1, ordinal);
2894 if (v == NULL)
2895 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002896 kind = PyUnicode_KIND(v);
2897 data = PyUnicode_DATA(v);
2898 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002899 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002901}
2902
Alexander Belopolsky40018472011-02-26 01:02:56 +00002903PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002904PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002906 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002908 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002909 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002910 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 Py_INCREF(obj);
2912 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002913 }
2914 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 /* For a Unicode subtype that's not a Unicode object,
2916 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002917 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002918 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002919 PyErr_Format(PyExc_TypeError,
2920 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002921 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002922 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002923}
2924
Alexander Belopolsky40018472011-02-26 01:02:56 +00002925PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002926PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002927 const char *encoding,
2928 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002929{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002930 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002931 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002932
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 PyErr_BadInternalCall();
2935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Decoding bytes objects is the most common case and should be fast */
2939 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002940 if (PyBytes_GET_SIZE(obj) == 0)
2941 _Py_RETURN_UNICODE_EMPTY();
2942 v = PyUnicode_Decode(
2943 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2944 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002945 return v;
2946 }
2947
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002948 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 PyErr_SetString(PyExc_TypeError,
2950 "decoding str is not supported");
2951 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002953
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002954 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2955 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2956 PyErr_Format(PyExc_TypeError,
2957 "coercing to str: need bytes, bytearray "
2958 "or buffer-like object, %.80s found",
2959 Py_TYPE(obj)->tp_name);
2960 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002961 }
Tim Petersced69f82003-09-16 20:30:58 +00002962
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002963 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002964 PyBuffer_Release(&buffer);
2965 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002967
Serhiy Storchaka05997252013-01-26 12:14:02 +02002968 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002969 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002970 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971}
2972
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".  On success
   the result stored in lower is null-terminated.

   Return 1 on success, 0 on error (lower_len is 0, or the normalized name
   is longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* An empty output buffer cannot even hold the null terminator.  Without
       this check, lower_len - 1 below would wrap around, the end pointer
       would be meaningless and the terminator would be written out of
       bounds. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the null terminator */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3012
Alexander Belopolsky40018472011-02-26 01:02:56 +00003013PyObject *
3014PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003015 Py_ssize_t size,
3016 const char *encoding,
3017 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003018{
3019 PyObject *buffer = NULL, *unicode;
3020 Py_buffer info;
3021 char lower[11]; /* Enough for any encoding shortcut */
3022
Fred Drakee4315f52000-05-09 19:53:39 +00003023 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003024 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003025 if ((strcmp(lower, "utf-8") == 0) ||
3026 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003027 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003028 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003029 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003030 (strcmp(lower, "iso-8859-1") == 0) ||
3031 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003032 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003033#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003034 else if (strcmp(lower, "mbcs") == 0)
3035 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003036#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003037 else if (strcmp(lower, "ascii") == 0)
3038 return PyUnicode_DecodeASCII(s, size, errors);
3039 else if (strcmp(lower, "utf-16") == 0)
3040 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3041 else if (strcmp(lower, "utf-32") == 0)
3042 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044
3045 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003046 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003047 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003048 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003049 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 if (buffer == NULL)
3051 goto onError;
3052 unicode = PyCodec_Decode(buffer, encoding, errors);
3053 if (unicode == NULL)
3054 goto onError;
3055 if (!PyUnicode_Check(unicode)) {
3056 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003057 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3058 "use codecs.decode() to decode to arbitrary types",
3059 encoding,
3060 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 Py_DECREF(unicode);
3062 goto onError;
3063 }
3064 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003065 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003066
Benjamin Peterson29060642009-01-31 22:14:21 +00003067 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 Py_XDECREF(buffer);
3069 return NULL;
3070}
3071
Alexander Belopolsky40018472011-02-26 01:02:56 +00003072PyObject *
3073PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003074 const char *encoding,
3075 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076{
3077 PyObject *v;
3078
3079 if (!PyUnicode_Check(unicode)) {
3080 PyErr_BadArgument();
3081 goto onError;
3082 }
3083
3084 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003085 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003086
3087 /* Decode via the codec registry */
3088 v = PyCodec_Decode(unicode, encoding, errors);
3089 if (v == NULL)
3090 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003091 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003092
Benjamin Peterson29060642009-01-31 22:14:21 +00003093 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003094 return NULL;
3095}
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
3098PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003099 const char *encoding,
3100 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003101{
3102 PyObject *v;
3103
3104 if (!PyUnicode_Check(unicode)) {
3105 PyErr_BadArgument();
3106 goto onError;
3107 }
3108
3109 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003110 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003111
3112 /* Decode via the codec registry */
3113 v = PyCodec_Decode(unicode, encoding, errors);
3114 if (v == NULL)
3115 goto onError;
3116 if (!PyUnicode_Check(v)) {
3117 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003118 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3119 "use codecs.decode() to decode to arbitrary types",
3120 encoding,
3121 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003122 Py_DECREF(v);
3123 goto onError;
3124 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003125 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003126
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003128 return NULL;
3129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
3132PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003133 Py_ssize_t size,
3134 const char *encoding,
3135 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136{
3137 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003138
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 unicode = PyUnicode_FromUnicode(s, size);
3140 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3143 Py_DECREF(unicode);
3144 return v;
3145}
3146
Alexander Belopolsky40018472011-02-26 01:02:56 +00003147PyObject *
3148PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003149 const char *encoding,
3150 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003151{
3152 PyObject *v;
3153
3154 if (!PyUnicode_Check(unicode)) {
3155 PyErr_BadArgument();
3156 goto onError;
3157 }
3158
3159 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003160 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003161
3162 /* Encode via the codec registry */
3163 v = PyCodec_Encode(unicode, encoding, errors);
3164 if (v == NULL)
3165 goto onError;
3166 return v;
3167
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003169 return NULL;
3170}
3171
/* Locate the first wide character of the null-terminated string wstr that
   wcstombs() refuses to convert, by converting one character (or, with a
   16-bit wchar_t, one surrogate pair) at a time.  Return its index in wstr,
   or 0 if no such character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Null-terminated probe string holding the unit under test */
#if SIZEOF_WCHAR_T == 2
    wchar_t probe[3] = {0, 0, 0};
#else
    wchar_t probe[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep a surrogate pair together */
            probe[0] = cursor[0];
            probe[1] = cursor[1];
            cursor += 2;
        }
        else {
            probe[0] = cursor[0];
            probe[1] = 0;
            cursor += 1;
        }
#else
        probe[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, probe, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3218
Victor Stinner1b579672011-12-17 05:47:23 +01003219static int
3220locale_error_handler(const char *errors, int *surrogateescape)
3221{
3222 if (errors == NULL) {
3223 *surrogateescape = 0;
3224 return 0;
3225 }
3226
3227 if (strcmp(errors, "strict") == 0) {
3228 *surrogateescape = 0;
3229 return 0;
3230 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003231 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003232 *surrogateescape = 1;
3233 return 0;
3234 }
3235 PyErr_Format(PyExc_ValueError,
3236 "only 'strict' and 'surrogateescape' error handlers "
3237 "are supported, not '%s'",
3238 errors);
3239 return -1;
3240}
3241
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003242PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003243PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003244{
3245 Py_ssize_t wlen, wlen2;
3246 wchar_t *wstr;
3247 PyObject *bytes = NULL;
3248 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003249 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003250 PyObject *exc;
3251 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003252 int surrogateescape;
3253
3254 if (locale_error_handler(errors, &surrogateescape) < 0)
3255 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003256
3257 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3258 if (wstr == NULL)
3259 return NULL;
3260
3261 wlen2 = wcslen(wstr);
3262 if (wlen2 != wlen) {
3263 PyMem_Free(wstr);
3264 PyErr_SetString(PyExc_TypeError, "embedded null character");
3265 return NULL;
3266 }
3267
3268 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003269 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003270 char *str;
3271
3272 str = _Py_wchar2char(wstr, &error_pos);
3273 if (str == NULL) {
3274 if (error_pos == (size_t)-1) {
3275 PyErr_NoMemory();
3276 PyMem_Free(wstr);
3277 return NULL;
3278 }
3279 else {
3280 goto encode_error;
3281 }
3282 }
3283 PyMem_Free(wstr);
3284
3285 bytes = PyBytes_FromString(str);
3286 PyMem_Free(str);
3287 }
3288 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003289 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003290 size_t len, len2;
3291
3292 len = wcstombs(NULL, wstr, 0);
3293 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003294 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003295 goto encode_error;
3296 }
3297
3298 bytes = PyBytes_FromStringAndSize(NULL, len);
3299 if (bytes == NULL) {
3300 PyMem_Free(wstr);
3301 return NULL;
3302 }
3303
3304 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3305 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003306 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003307 goto encode_error;
3308 }
3309 PyMem_Free(wstr);
3310 }
3311 return bytes;
3312
3313encode_error:
3314 errmsg = strerror(errno);
3315 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003316
3317 if (error_pos == (size_t)-1)
3318 error_pos = wcstombs_errorpos(wstr);
3319
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003320 PyMem_Free(wstr);
3321 Py_XDECREF(bytes);
3322
Victor Stinner2f197072011-12-17 07:08:30 +01003323 if (errmsg != NULL) {
3324 size_t errlen;
3325 wstr = _Py_char2wchar(errmsg, &errlen);
3326 if (wstr != NULL) {
3327 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003328 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003329 } else
3330 errmsg = NULL;
3331 }
3332 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003333 reason = PyUnicode_FromString(
3334 "wcstombs() encountered an unencodable "
3335 "wide character");
3336 if (reason == NULL)
3337 return NULL;
3338
3339 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3340 "locale", unicode,
3341 (Py_ssize_t)error_pos,
3342 (Py_ssize_t)(error_pos+1),
3343 reason);
3344 Py_DECREF(reason);
3345 if (exc != NULL) {
3346 PyCodec_StrictErrors(exc);
3347 Py_XDECREF(exc);
3348 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003349 return NULL;
3350}
3351
Victor Stinnerad158722010-10-27 00:25:46 +00003352PyObject *
3353PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003354{
Victor Stinner99b95382011-07-04 14:23:54 +02003355#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003356 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003357#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003359#else
Victor Stinner793b5312011-04-27 00:24:21 +02003360 PyInterpreterState *interp = PyThreadState_GET()->interp;
3361 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3362 cannot use it to encode and decode filenames before it is loaded. Load
3363 the Python codec requires to encode at least its own filename. Use the C
3364 version of the locale codec until the codec registry is initialized and
3365 the Python codec is loaded.
3366
3367 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3368 cannot only rely on it: check also interp->fscodec_initialized for
3369 subinterpreters. */
3370 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003371 return PyUnicode_AsEncodedString(unicode,
3372 Py_FileSystemDefaultEncoding,
3373 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003374 }
3375 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003376 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003377 }
Victor Stinnerad158722010-10-27 00:25:46 +00003378#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003379}
3380
Alexander Belopolsky40018472011-02-26 01:02:56 +00003381PyObject *
3382PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003383 const char *encoding,
3384 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385{
3386 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003387 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003388
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 if (!PyUnicode_Check(unicode)) {
3390 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 }
Fred Drakee4315f52000-05-09 19:53:39 +00003393
Fred Drakee4315f52000-05-09 19:53:39 +00003394 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003395 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003396 if ((strcmp(lower, "utf-8") == 0) ||
3397 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003398 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003399 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003401 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003402 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003403 }
Victor Stinner37296e82010-06-10 13:36:23 +00003404 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003405 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003406 (strcmp(lower, "iso-8859-1") == 0) ||
3407 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003409#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003410 else if (strcmp(lower, "mbcs") == 0)
3411 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003412#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003413 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003414 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416
3417 /* Encode via the codec registry */
3418 v = PyCodec_Encode(unicode, encoding, errors);
3419 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003420 return NULL;
3421
3422 /* The normal path */
3423 if (PyBytes_Check(v))
3424 return v;
3425
3426 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003427 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003428 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003429 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003430
3431 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003432 "encoder %s returned bytearray instead of bytes; "
3433 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003434 encoding);
3435 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003436 Py_DECREF(v);
3437 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003439
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3441 Py_DECREF(v);
3442 return b;
3443 }
3444
3445 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003446 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3447 "use codecs.encode() to encode to arbitrary types",
3448 encoding,
3449 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003450 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003451 return NULL;
3452}
3453
Alexander Belopolsky40018472011-02-26 01:02:56 +00003454PyObject *
3455PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003456 const char *encoding,
3457 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003458{
3459 PyObject *v;
3460
3461 if (!PyUnicode_Check(unicode)) {
3462 PyErr_BadArgument();
3463 goto onError;
3464 }
3465
3466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003468
3469 /* Encode via the codec registry */
3470 v = PyCodec_Encode(unicode, encoding, errors);
3471 if (v == NULL)
3472 goto onError;
3473 if (!PyUnicode_Check(v)) {
3474 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003475 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3476 "use codecs.encode() to encode to arbitrary types",
3477 encoding,
3478 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003479 Py_DECREF(v);
3480 goto onError;
3481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003483
Benjamin Peterson29060642009-01-31 22:14:21 +00003484 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485 return NULL;
3486}
3487
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot convert and return its offset from str.  Return 0 if none is
   found, or if mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    /* Begin in the initial conversion state */
    memset(&state, 0, sizeof state);
    while (remaining) {
        size_t consumed = mbrtowc(&wc, (char*)cursor, remaining, &state);

        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3518
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003519PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003520PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003521 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522{
3523 wchar_t smallbuf[256];
3524 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3525 wchar_t *wstr;
3526 size_t wlen, wlen2;
3527 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003528 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003529 size_t error_pos;
3530 char *errmsg;
3531 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003532
3533 if (locale_error_handler(errors, &surrogateescape) < 0)
3534 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003535
3536 if (str[len] != '\0' || len != strlen(str)) {
3537 PyErr_SetString(PyExc_TypeError, "embedded null character");
3538 return NULL;
3539 }
3540
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003541 if (surrogateescape) {
3542 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003543 wstr = _Py_char2wchar(str, &wlen);
3544 if (wstr == NULL) {
3545 if (wlen == (size_t)-1)
3546 PyErr_NoMemory();
3547 else
3548 PyErr_SetFromErrno(PyExc_OSError);
3549 return NULL;
3550 }
3551
3552 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003553 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003554 }
3555 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003556 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003557#ifndef HAVE_BROKEN_MBSTOWCS
3558 wlen = mbstowcs(NULL, str, 0);
3559#else
3560 wlen = len;
3561#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003562 if (wlen == (size_t)-1)
3563 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003564 if (wlen+1 <= smallbuf_len) {
3565 wstr = smallbuf;
3566 }
3567 else {
3568 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3569 return PyErr_NoMemory();
3570
3571 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3572 if (!wstr)
3573 return PyErr_NoMemory();
3574 }
3575
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003576 wlen2 = mbstowcs(wstr, str, wlen+1);
3577 if (wlen2 == (size_t)-1) {
3578 if (wstr != smallbuf)
3579 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003580 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003581 }
3582#ifdef HAVE_BROKEN_MBSTOWCS
3583 assert(wlen2 == wlen);
3584#endif
3585 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3586 if (wstr != smallbuf)
3587 PyMem_Free(wstr);
3588 }
3589 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003590
3591decode_error:
3592 errmsg = strerror(errno);
3593 assert(errmsg != NULL);
3594
3595 error_pos = mbstowcs_errorpos(str, len);
3596 if (errmsg != NULL) {
3597 size_t errlen;
3598 wstr = _Py_char2wchar(errmsg, &errlen);
3599 if (wstr != NULL) {
3600 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003601 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003602 } else
3603 errmsg = NULL;
3604 }
3605 if (errmsg == NULL)
3606 reason = PyUnicode_FromString(
3607 "mbstowcs() encountered an invalid multibyte sequence");
3608 if (reason == NULL)
3609 return NULL;
3610
3611 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3612 "locale", str, len,
3613 (Py_ssize_t)error_pos,
3614 (Py_ssize_t)(error_pos+1),
3615 reason);
3616 Py_DECREF(reason);
3617 if (exc != NULL) {
3618 PyCodec_StrictErrors(exc);
3619 Py_XDECREF(exc);
3620 }
3621 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003622}
3623
3624PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003625PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003626{
3627 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003628 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003629}
3630
3631
3632PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003633PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003634 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003635 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3636}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003637
Christian Heimes5894ba72007-11-04 11:43:14 +00003638PyObject*
3639PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3640{
Victor Stinner99b95382011-07-04 14:23:54 +02003641#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003642 return PyUnicode_DecodeMBCS(s, size, NULL);
3643#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003644 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003645#else
Victor Stinner793b5312011-04-27 00:24:21 +02003646 PyInterpreterState *interp = PyThreadState_GET()->interp;
3647 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3648 cannot use it to encode and decode filenames before it is loaded. Load
3649 the Python codec requires to encode at least its own filename. Use the C
3650 version of the locale codec until the codec registry is initialized and
3651 the Python codec is loaded.
3652
3653 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3654 cannot only rely on it: check also interp->fscodec_initialized for
3655 subinterpreters. */
3656 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003657 return PyUnicode_Decode(s, size,
3658 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003659 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003660 }
3661 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003662 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003663 }
Victor Stinnerad158722010-10-27 00:25:46 +00003664#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003665}
3666
Martin v. Löwis011e8422009-05-05 04:43:17 +00003667
3668int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003669_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003670{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003671 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003672
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003673 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003674 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003675 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3676 PyUnicode_GET_LENGTH(str), '\0', 1);
3677 if (pos == -1)
3678 return 0;
3679 else
3680 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003681}
3682
Antoine Pitrou13348842012-01-29 18:36:34 +01003683int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003684PyUnicode_FSConverter(PyObject* arg, void* addr)
3685{
3686 PyObject *output = NULL;
3687 Py_ssize_t size;
3688 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003689 if (arg == NULL) {
3690 Py_DECREF(*(PyObject**)addr);
3691 return 1;
3692 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003693 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003694 output = arg;
3695 Py_INCREF(output);
3696 }
3697 else {
3698 arg = PyUnicode_FromObject(arg);
3699 if (!arg)
3700 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003701 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003702 Py_DECREF(arg);
3703 if (!output)
3704 return 0;
3705 if (!PyBytes_Check(output)) {
3706 Py_DECREF(output);
3707 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3708 return 0;
3709 }
3710 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003711 size = PyBytes_GET_SIZE(output);
3712 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003713 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003714 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003715 Py_DECREF(output);
3716 return 0;
3717 }
3718 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003719 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003720}
3721
3722
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003723int
3724PyUnicode_FSDecoder(PyObject* arg, void* addr)
3725{
3726 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003727 if (arg == NULL) {
3728 Py_DECREF(*(PyObject**)addr);
3729 return 1;
3730 }
3731 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003732 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003734 output = arg;
3735 Py_INCREF(output);
3736 }
3737 else {
3738 arg = PyBytes_FromObject(arg);
3739 if (!arg)
3740 return 0;
3741 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3742 PyBytes_GET_SIZE(arg));
3743 Py_DECREF(arg);
3744 if (!output)
3745 return 0;
3746 if (!PyUnicode_Check(output)) {
3747 Py_DECREF(output);
3748 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3749 return 0;
3750 }
3751 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003752 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003753 Py_DECREF(output);
3754 return 0;
3755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003757 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003758 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3759 Py_DECREF(output);
3760 return 0;
3761 }
3762 *(PyObject**)addr = output;
3763 return Py_CLEANUP_SUPPORTED;
3764}
3765
3766
Martin v. Löwis5b222132007-06-10 09:51:05 +00003767char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003769{
Christian Heimesf3863112007-11-22 07:46:41 +00003770 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003772 if (!PyUnicode_Check(unicode)) {
3773 PyErr_BadArgument();
3774 return NULL;
3775 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003777 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003779 if (PyUnicode_UTF8(unicode) == NULL) {
3780 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3782 if (bytes == NULL)
3783 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3785 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003786 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 Py_DECREF(bytes);
3788 return NULL;
3789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003790 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3791 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3792 PyBytes_AS_STRING(bytes),
3793 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003794 Py_DECREF(bytes);
3795 }
3796
3797 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003798 *psize = PyUnicode_UTF8_LENGTH(unicode);
3799 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003800}
3801
3802char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3806}
3807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003808Py_UNICODE *
3809PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 const unsigned char *one_byte;
3812#if SIZEOF_WCHAR_T == 4
3813 const Py_UCS2 *two_bytes;
3814#else
3815 const Py_UCS4 *four_bytes;
3816 const Py_UCS4 *ucs4_end;
3817 Py_ssize_t num_surrogates;
3818#endif
3819 wchar_t *w;
3820 wchar_t *wchar_end;
3821
3822 if (!PyUnicode_Check(unicode)) {
3823 PyErr_BadArgument();
3824 return NULL;
3825 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003828 assert(_PyUnicode_KIND(unicode) != 0);
3829 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003831 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3834 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 num_surrogates = 0;
3836
3837 for (; four_bytes < ucs4_end; ++four_bytes) {
3838 if (*four_bytes > 0xFFFF)
3839 ++num_surrogates;
3840 }
3841
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003842 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3843 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3844 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 PyErr_NoMemory();
3846 return NULL;
3847 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003848 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003850 w = _PyUnicode_WSTR(unicode);
3851 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3852 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3854 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003855 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003857 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3858 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 }
3860 else
3861 *w = *four_bytes;
3862
3863 if (w > wchar_end) {
3864 assert(0 && "Miscalculated string end");
3865 }
3866 }
3867 *w = 0;
3868#else
3869 /* sizeof(wchar_t) == 4 */
3870 Py_FatalError("Impossible unicode object state, wstr and str "
3871 "should share memory already.");
3872 return NULL;
3873#endif
3874 }
3875 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003876 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3877 (_PyUnicode_LENGTH(unicode) + 1));
3878 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 PyErr_NoMemory();
3880 return NULL;
3881 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3883 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3884 w = _PyUnicode_WSTR(unicode);
3885 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3888 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 for (; w < wchar_end; ++one_byte, ++w)
3890 *w = *one_byte;
3891 /* null-terminate the wstr */
3892 *w = 0;
3893 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003896 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 for (; w < wchar_end; ++two_bytes, ++w)
3898 *w = *two_bytes;
3899 /* null-terminate the wstr */
3900 *w = 0;
3901#else
3902 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003903 PyObject_FREE(_PyUnicode_WSTR(unicode));
3904 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 Py_FatalError("Impossible unicode object state, wstr "
3906 "and str should share memory already.");
3907 return NULL;
3908#endif
3909 }
3910 else {
3911 assert(0 && "This should never happen.");
3912 }
3913 }
3914 }
3915 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003916 *size = PyUnicode_WSTR_LENGTH(unicode);
3917 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003918}
3919
Alexander Belopolsky40018472011-02-26 01:02:56 +00003920Py_UNICODE *
3921PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924}
3925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926
Alexander Belopolsky40018472011-02-26 01:02:56 +00003927Py_ssize_t
3928PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929{
3930 if (!PyUnicode_Check(unicode)) {
3931 PyErr_BadArgument();
3932 goto onError;
3933 }
3934 return PyUnicode_GET_SIZE(unicode);
3935
Benjamin Peterson29060642009-01-31 22:14:21 +00003936 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 return -1;
3938}
3939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940Py_ssize_t
3941PyUnicode_GetLength(PyObject *unicode)
3942{
Victor Stinner07621332012-06-16 04:53:46 +02003943 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 PyErr_BadArgument();
3945 return -1;
3946 }
Victor Stinner07621332012-06-16 04:53:46 +02003947 if (PyUnicode_READY(unicode) == -1)
3948 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 return PyUnicode_GET_LENGTH(unicode);
3950}
3951
3952Py_UCS4
3953PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3954{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003955 void *data;
3956 int kind;
3957
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003958 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3959 PyErr_BadArgument();
3960 return (Py_UCS4)-1;
3961 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003962 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003963 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 return (Py_UCS4)-1;
3965 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003966 data = PyUnicode_DATA(unicode);
3967 kind = PyUnicode_KIND(unicode);
3968 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969}
3970
3971int
3972PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3973{
3974 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003975 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976 return -1;
3977 }
Victor Stinner488fa492011-12-12 00:01:39 +01003978 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003979 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003980 PyErr_SetString(PyExc_IndexError, "string index out of range");
3981 return -1;
3982 }
Victor Stinner488fa492011-12-12 00:01:39 +01003983 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003984 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003985 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3986 PyErr_SetString(PyExc_ValueError, "character out of range");
3987 return -1;
3988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3990 index, ch);
3991 return 0;
3992}
3993
/* Return the name of the default encoding: always the constant string
   "utf-8". The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3999
Victor Stinner554f3f02010-06-16 23:33:54 +00004000/* create or adjust a UnicodeDecodeError */
4001static void
4002make_decode_exception(PyObject **exceptionObject,
4003 const char *encoding,
4004 const char *input, Py_ssize_t length,
4005 Py_ssize_t startpos, Py_ssize_t endpos,
4006 const char *reason)
4007{
4008 if (*exceptionObject == NULL) {
4009 *exceptionObject = PyUnicodeDecodeError_Create(
4010 encoding, input, length, startpos, endpos, reason);
4011 }
4012 else {
4013 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4014 goto onError;
4015 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4016 goto onError;
4017 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4018 goto onError;
4019 }
4020 return;
4021
4022onError:
4023 Py_DECREF(*exceptionObject);
4024 *exceptionObject = NULL;
4025}
4026
#ifdef HAVE_MBCS
/* error handling callback helper (wchar_t output buffer variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors / errorHandler:  name of the error handler and a cache for the
       looked-up callable (filled on first use).
   encoding / reason:      used to create or update *exceptionObject.
   input / inend:          bounds of the input; reloaded from the exception
       object because the callback may have replaced it.
   startinpos / endinpos:  range of the undecodable input; *endinpos is set
       to the position returned by the callback.
   inptr:                  set to the input position to resume from.
   output / outpos:        wstr-kind result string and write position; the
       string is resized if needed and *outpos is advanced.

   return 0 on success, -1 on error (with an exception set)
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix: only the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    /* repunicode is borrowed from restuple */
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* do not go on and read a non-bytes object as bytes */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position is relative to the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string).
       The sum is built step by step so that it cannot overflow. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically, unless doubling would itself overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4133
4134static int
4135unicode_decode_call_errorhandler_writer(
4136 const char *errors, PyObject **errorHandler,
4137 const char *encoding, const char *reason,
4138 const char **input, const char **inend, Py_ssize_t *startinpos,
4139 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4140 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4141{
4142 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4143
4144 PyObject *restuple = NULL;
4145 PyObject *repunicode = NULL;
4146 Py_ssize_t insize;
4147 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004148 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004149 PyObject *inputobj = NULL;
4150
4151 if (*errorHandler == NULL) {
4152 *errorHandler = PyCodec_LookupError(errors);
4153 if (*errorHandler == NULL)
4154 goto onError;
4155 }
4156
4157 make_decode_exception(exceptionObject,
4158 encoding,
4159 *input, *inend - *input,
4160 *startinpos, *endinpos,
4161 reason);
4162 if (*exceptionObject == NULL)
4163 goto onError;
4164
4165 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4166 if (restuple == NULL)
4167 goto onError;
4168 if (!PyTuple_Check(restuple)) {
4169 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4170 goto onError;
4171 }
4172 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004173 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004174
4175 /* Copy back the bytes variables, which might have been modified by the
4176 callback */
4177 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4178 if (!inputobj)
4179 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004180 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004182 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004183 *input = PyBytes_AS_STRING(inputobj);
4184 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004185 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004186 /* we can DECREF safely, as the exception has another reference,
4187 so the object won't go away. */
4188 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004189
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004192 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004193 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4194 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004195 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196
Victor Stinner8f674cc2013-04-17 23:02:17 +02004197 if (PyUnicode_READY(repunicode) < 0)
4198 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004199 replen = PyUnicode_GET_LENGTH(repunicode);
4200 writer->min_length += replen;
4201 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004202 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004203 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004204 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004205
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004207 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004208
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004210 Py_XDECREF(restuple);
4211 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004215 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216}
4217
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* The table is only ever read, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c comes first, so the table is never indexed out of
 * bounds.  directO and directWS are parenthesized so that compound
 * expressions can be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004298
Alexander Belopolsky40018472011-02-26 01:02:56 +00004299PyObject *
4300PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004301 Py_ssize_t size,
4302 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004303{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004304 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4305}
4306
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307/* The decoder. The only state we preserve is our read position,
4308 * i.e. how many characters we have consumed. So if we end in the
4309 * middle of a shift sequence we have to back off the read position
4310 * and the output to the beginning of the sequence, otherwise we lose
4311 * all the shift state (seen bits, number of bits seen, high
4312 * surrogate). */
4313
Alexander Belopolsky40018472011-02-26 01:02:56 +00004314PyObject *
4315PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004316 Py_ssize_t size,
4317 const char *errors,
4318 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004319{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004321 Py_ssize_t startinpos;
4322 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004324 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325 const char *errmsg = "";
4326 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004327 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 unsigned int base64bits = 0;
4329 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004330 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 PyObject *errorHandler = NULL;
4332 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004334 if (size == 0) {
4335 if (consumed)
4336 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004337 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004338 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004339
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004340 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004341 _PyUnicodeWriter_Init(&writer);
4342 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004343
4344 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 e = s + size;
4346
4347 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004348 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004350 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 if (inShift) { /* in a base-64 section */
4353 if (IS_BASE64(ch)) { /* consume a base-64 character */
4354 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4355 base64bits += 6;
4356 s++;
4357 if (base64bits >= 16) {
4358 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004359 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 base64bits -= 16;
4361 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004362 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 if (surrogate) {
4364 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004365 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4366 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004367 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004368 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004370 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 }
4372 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004373 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004374 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 }
4377 }
Victor Stinner551ac952011-11-29 22:58:13 +01004378 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 /* first surrogate */
4380 surrogate = outCh;
4381 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004383 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004384 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 }
4386 }
4387 }
4388 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 inShift = 0;
4390 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004392 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004393 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004394 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004395 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 if (base64bits > 0) { /* left-over bits */
4397 if (base64bits >= 6) {
4398 /* We've seen at least one base-64 character */
4399 errmsg = "partial character in shift sequence";
4400 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 else {
4403 /* Some bits remain; they should be zero */
4404 if (base64buffer != 0) {
4405 errmsg = "non-zero padding bits in shift sequence";
4406 goto utf7Error;
4407 }
4408 }
4409 }
4410 if (ch != '-') {
4411 /* '-' is absorbed; other terminating
4412 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004413 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
4417 }
4418 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 s++; /* consume '+' */
4421 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004423 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004424 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425 }
4426 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004428 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004430 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 }
4432 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004435 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 else {
4439 startinpos = s-starts;
4440 s++;
4441 errmsg = "unexpected special character";
4442 goto utf7Error;
4443 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004447 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 errors, &errorHandler,
4449 "utf7", errmsg,
4450 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004451 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453 }
4454
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 /* end of string */
4456
4457 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4458 /* if we're in an inconsistent state, that's an error */
4459 if (surrogate ||
4460 (base64bits >= 6) ||
4461 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 errors, &errorHandler,
4465 "utf7", "unterminated shift sequence",
4466 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004467 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 goto onError;
4469 if (s < e)
4470 goto restart;
4471 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473
4474 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004475 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004478 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 }
4480 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004481 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004482 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004483 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 Py_XDECREF(errorHandler);
4486 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004487 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004488
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 Py_XDECREF(errorHandler);
4491 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004492 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493 return NULL;
4494}
4495
4496
Alexander Belopolsky40018472011-02-26 01:02:56 +00004497PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004498_PyUnicode_EncodeUTF7(PyObject *str,
4499 int base64SetO,
4500 int base64WhiteSpace,
4501 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004503 int kind;
4504 void *data;
4505 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004506 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004508 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004509 unsigned int base64bits = 0;
4510 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511 char * out;
4512 char * start;
4513
Benjamin Petersonbac79492012-01-14 13:34:47 -05004514 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004515 return NULL;
4516 kind = PyUnicode_KIND(str);
4517 data = PyUnicode_DATA(str);
4518 len = PyUnicode_GET_LENGTH(str);
4519
4520 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004523 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004524 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004525 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004526 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527 if (v == NULL)
4528 return NULL;
4529
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004530 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004531 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 if (inShift) {
4535 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4536 /* shifting out */
4537 if (base64bits) { /* output remaining bits */
4538 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4539 base64buffer = 0;
4540 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
4542 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 /* Characters not in the BASE64 set implicitly unshift the sequence
4544 so no '-' is required, except if the character is itself a '-' */
4545 if (IS_BASE64(ch) || ch == '-') {
4546 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 *out++ = (char) ch;
4549 }
4550 else {
4551 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004552 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004554 else { /* not in a shift sequence */
4555 if (ch == '+') {
4556 *out++ = '+';
4557 *out++ = '-';
4558 }
4559 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4560 *out++ = (char) ch;
4561 }
4562 else {
4563 *out++ = '+';
4564 inShift = 1;
4565 goto encode_char;
4566 }
4567 }
4568 continue;
4569encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004571 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004572
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 /* code first surrogate */
4574 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004575 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 while (base64bits >= 6) {
4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4578 base64bits -= 6;
4579 }
4580 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004581 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 base64bits += 16;
4584 base64buffer = (base64buffer << 16) | ch;
4585 while (base64bits >= 6) {
4586 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4587 base64bits -= 6;
4588 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004589 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 if (base64bits)
4591 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4592 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004593 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004594 if (_PyBytes_Resize(&v, out - start) < 0)
4595 return NULL;
4596 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004597}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004598PyObject *
4599PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4600 Py_ssize_t size,
4601 int base64SetO,
4602 int base64WhiteSpace,
4603 const char *errors)
4604{
4605 PyObject *result;
4606 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4607 if (tmp == NULL)
4608 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004609 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004610 base64WhiteSpace, errors);
4611 Py_DECREF(tmp);
4612 return result;
4613}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615#undef IS_BASE64
4616#undef FROM_BASE64
4617#undef TO_BASE64
4618#undef DECODE_DIRECT
4619#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621/* --- UTF-8 Codec -------------------------------------------------------- */
4622
Alexander Belopolsky40018472011-02-26 01:02:56 +00004623PyObject *
4624PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004625 Py_ssize_t size,
4626 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627{
Walter Dörwald69652032004-09-07 20:24:22 +00004628 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4629}
4630
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004631#include "stringlib/asciilib.h"
4632#include "stringlib/codecs.h"
4633#include "stringlib/undef.h"
4634
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004635#include "stringlib/ucs1lib.h"
4636#include "stringlib/codecs.h"
4637#include "stringlib/undef.h"
4638
4639#include "stringlib/ucs2lib.h"
4640#include "stringlib/codecs.h"
4641#include "stringlib/undef.h"
4642
4643#include "stringlib/ucs4lib.h"
4644#include "stringlib/codecs.h"
4645#include "stringlib/undef.h"
4646
Antoine Pitrouab868312009-01-10 15:40:25 +00004647/* Mask to quickly check whether a C 'long' contains a
4648 non-ASCII, UTF8-encoded char. */
4649#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004650# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004651#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004652# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004653#else
4654# error C 'long' size should be either 4 or 8!
4655#endif
4656
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004657static Py_ssize_t
4658ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004659{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004660 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004661 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004662
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004663 /*
4664 * Issue #17237: m68k is a bit different from most architectures in
4665 * that objects do not use "natural alignment" - for example, int and
4666 * long are only aligned at 2-byte boundaries. Therefore the assert()
4667 * won't work; also, tests have shown that skipping the "optimised
4668 * version" will even speed up m68k.
4669 */
4670#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004672 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4673 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 /* Fast path, see in STRINGLIB(utf8_decode) for
4675 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004676 /* Help allocation */
4677 const char *_p = p;
4678 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004679 while (_p < aligned_end) {
4680 unsigned long value = *(const unsigned long *) _p;
4681 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683 *((unsigned long *)q) = value;
4684 _p += SIZEOF_LONG;
4685 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004686 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004687 p = _p;
4688 while (p < end) {
4689 if ((unsigned char)*p & 0x80)
4690 break;
4691 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004696#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004697 while (p < end) {
4698 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4699 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004700 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004701 /* Help allocation */
4702 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004703 while (_p < aligned_end) {
4704 unsigned long value = *(unsigned long *) _p;
4705 if (value & ASCII_CHAR_MASK)
4706 break;
4707 _p += SIZEOF_LONG;
4708 }
4709 p = _p;
4710 if (_p == end)
4711 break;
4712 }
4713 if ((unsigned char)*p & 0x80)
4714 break;
4715 ++p;
4716 }
4717 memcpy(dest, start, p - start);
4718 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719}
Antoine Pitrouab868312009-01-10 15:40:25 +00004720
Victor Stinner785938e2011-12-11 20:09:03 +01004721PyObject *
4722PyUnicode_DecodeUTF8Stateful(const char *s,
4723 Py_ssize_t size,
4724 const char *errors,
4725 Py_ssize_t *consumed)
4726{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004727 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004728 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730
4731 Py_ssize_t startinpos;
4732 Py_ssize_t endinpos;
4733 const char *errmsg = "";
4734 PyObject *errorHandler = NULL;
4735 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004736
4737 if (size == 0) {
4738 if (consumed)
4739 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004740 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004741 }
4742
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4744 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004745 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004746 *consumed = 1;
4747 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004748 }
4749
Victor Stinner8f674cc2013-04-17 23:02:17 +02004750 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004751 writer.min_length = size;
4752 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004753 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004754
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 writer.pos = ascii_decode(s, end, writer.data);
4756 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 while (s < end) {
4758 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004759 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004761 if (PyUnicode_IS_ASCII(writer.buffer))
4762 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004763 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004764 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004766 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004767 } else {
4768 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004769 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 }
4771
4772 switch (ch) {
4773 case 0:
4774 if (s == end || consumed)
4775 goto End;
4776 errmsg = "unexpected end of data";
4777 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004778 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 break;
4780 case 1:
4781 errmsg = "invalid start byte";
4782 startinpos = s - starts;
4783 endinpos = startinpos + 1;
4784 break;
4785 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004786 case 3:
4787 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 errmsg = "invalid continuation byte";
4789 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004790 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004791 break;
4792 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004793 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 goto onError;
4795 continue;
4796 }
4797
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004798 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004799 errors, &errorHandler,
4800 "utf-8", errmsg,
4801 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004802 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004803 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004804 }
4805
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004806End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004807 if (consumed)
4808 *consumed = s - starts;
4809
4810 Py_XDECREF(errorHandler);
4811 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004812 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004813
4814onError:
4815 Py_XDECREF(errorHandler);
4816 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004817 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004818 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004819}
4820
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004821#ifdef __APPLE__
4822
4823/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004824 used to decode the command line arguments on Mac OS X.
4825
4826 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004827 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004828
4829wchar_t*
4830_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4831{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004832 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 wchar_t *unicode;
4834 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004835
4836 /* Note: size will always be longer than the resulting Unicode
4837 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004838 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004839 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004840 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004841 if (!unicode)
4842 return NULL;
4843
4844 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004845 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004846 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004847 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004848 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004849#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004851#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004852 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004853#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004854 if (ch > 0xFF) {
4855#if SIZEOF_WCHAR_T == 4
4856 assert(0);
4857#else
4858 assert(Py_UNICODE_IS_SURROGATE(ch));
4859 /* compute and append the two surrogates: */
4860 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4861 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4862#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004863 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004864 else {
4865 if (!ch && s == e)
4866 break;
4867 /* surrogateescape */
4868 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4869 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004870 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004872 return unicode;
4873}
4874
4875#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877/* Primary internal function which creates utf8 encoded bytes objects.
4878
4879 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004880 and allocate exactly as much space needed at the end. Else allocate the
4881 maximum possible needed (4 result bytes per Unicode character), and return
4882 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004883*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004884PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004885_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886{
Victor Stinner6099a032011-12-18 14:22:26 +01004887 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004888 void *data;
4889 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891 if (!PyUnicode_Check(unicode)) {
4892 PyErr_BadArgument();
4893 return NULL;
4894 }
4895
4896 if (PyUnicode_READY(unicode) == -1)
4897 return NULL;
4898
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004899 if (PyUnicode_UTF8(unicode))
4900 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4901 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902
4903 kind = PyUnicode_KIND(unicode);
4904 data = PyUnicode_DATA(unicode);
4905 size = PyUnicode_GET_LENGTH(unicode);
4906
Benjamin Petersonead6b532011-12-20 17:23:42 -06004907 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004908 default:
4909 assert(0);
4910 case PyUnicode_1BYTE_KIND:
4911 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4912 assert(!PyUnicode_IS_ASCII(unicode));
4913 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4914 case PyUnicode_2BYTE_KIND:
4915 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4916 case PyUnicode_4BYTE_KIND:
4917 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919}
4920
Alexander Belopolsky40018472011-02-26 01:02:56 +00004921PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004922PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4923 Py_ssize_t size,
4924 const char *errors)
4925{
4926 PyObject *v, *unicode;
4927
4928 unicode = PyUnicode_FromUnicode(s, size);
4929 if (unicode == NULL)
4930 return NULL;
4931 v = _PyUnicode_AsUTF8String(unicode, errors);
4932 Py_DECREF(unicode);
4933 return v;
4934}
4935
4936PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004937PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004939 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940}
4941
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942/* --- UTF-32 Codec ------------------------------------------------------- */
4943
4944PyObject *
4945PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors,
4948 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949{
4950 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4951}
4952
4953PyObject *
4954PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 Py_ssize_t size,
4956 const char *errors,
4957 int *byteorder,
4958 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959{
4960 const char *starts = s;
4961 Py_ssize_t startinpos;
4962 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004963 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004964 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004965 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004966 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967 PyObject *errorHandler = NULL;
4968 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004969
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970 q = (unsigned char *)s;
4971 e = q + size;
4972
4973 if (byteorder)
4974 bo = *byteorder;
4975
4976 /* Check for BOM marks (U+FEFF) in the input and adjust current
4977 byte order setting accordingly. In native mode, the leading BOM
4978 mark is skipped, in all other modes, it is copied to the output
4979 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004980 if (bo == 0 && size >= 4) {
4981 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4982 if (bom == 0x0000FEFF) {
4983 bo = -1;
4984 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 else if (bom == 0xFFFE0000) {
4987 bo = 1;
4988 q += 4;
4989 }
4990 if (byteorder)
4991 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004992 }
4993
Victor Stinnere64322e2012-10-30 23:12:47 +01004994 if (q == e) {
4995 if (consumed)
4996 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004997 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004998 }
4999
Victor Stinnere64322e2012-10-30 23:12:47 +01005000#ifdef WORDS_BIGENDIAN
5001 le = bo < 0;
5002#else
5003 le = bo <= 0;
5004#endif
5005
Victor Stinner8f674cc2013-04-17 23:02:17 +02005006 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005007 writer.min_length = (e - q + 3) / 4;
5008 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005010
Victor Stinnere64322e2012-10-30 23:12:47 +01005011 while (1) {
5012 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005013 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005014
Victor Stinnere64322e2012-10-30 23:12:47 +01005015 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005016 enum PyUnicode_Kind kind = writer.kind;
5017 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005018 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005019 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005020 if (le) {
5021 do {
5022 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5023 if (ch > maxch)
5024 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005025 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005026 q += 4;
5027 } while (q <= last);
5028 }
5029 else {
5030 do {
5031 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5032 if (ch > maxch)
5033 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005034 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005035 q += 4;
5036 } while (q <= last);
5037 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005038 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005039 }
5040
5041 if (ch <= maxch) {
5042 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005044 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005046 startinpos = ((const char *)q) - starts;
5047 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005049 else {
5050 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005051 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005052 goto onError;
5053 q += 4;
5054 continue;
5055 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005057 startinpos = ((const char *)q) - starts;
5058 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005060
5061 /* The remaining input chars are ignored if the callback
5062 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005063 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 errors, &errorHandler,
5065 "utf32", errmsg,
5066 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005067 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069 }
5070
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073
Walter Dörwald41980ca2007-08-16 21:55:45 +00005074 Py_XDECREF(errorHandler);
5075 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005076 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005079 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 Py_XDECREF(errorHandler);
5081 Py_XDECREF(exc);
5082 return NULL;
5083}
5084
5085PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005086_PyUnicode_EncodeUTF32(PyObject *str,
5087 const char *errors,
5088 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005090 int kind;
5091 void *data;
5092 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005093 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005095 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005097#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 int iorder[] = {0, 1, 2, 3};
5099#else
5100 int iorder[] = {3, 2, 1, 0};
5101#endif
5102
Benjamin Peterson29060642009-01-31 22:14:21 +00005103#define STORECHAR(CH) \
5104 do { \
5105 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5106 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5107 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5108 p[iorder[0]] = (CH) & 0xff; \
5109 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 } while(0)
5111
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005112 if (!PyUnicode_Check(str)) {
5113 PyErr_BadArgument();
5114 return NULL;
5115 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005116 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005117 return NULL;
5118 kind = PyUnicode_KIND(str);
5119 data = PyUnicode_DATA(str);
5120 len = PyUnicode_GET_LENGTH(str);
5121
5122 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005123 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005125 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005126 if (v == NULL)
5127 return NULL;
5128
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005129 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005132 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005133 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134
5135 if (byteorder == -1) {
5136 /* force LE */
5137 iorder[0] = 0;
5138 iorder[1] = 1;
5139 iorder[2] = 2;
5140 iorder[3] = 3;
5141 }
5142 else if (byteorder == 1) {
5143 /* force BE */
5144 iorder[0] = 3;
5145 iorder[1] = 2;
5146 iorder[2] = 1;
5147 iorder[3] = 0;
5148 }
5149
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005150 for (i = 0; i < len; i++)
5151 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005152
5153 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005154 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005155#undef STORECHAR
5156}
5157
Alexander Belopolsky40018472011-02-26 01:02:56 +00005158PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005159PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5160 Py_ssize_t size,
5161 const char *errors,
5162 int byteorder)
5163{
5164 PyObject *result;
5165 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5166 if (tmp == NULL)
5167 return NULL;
5168 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5169 Py_DECREF(tmp);
5170 return result;
5171}
5172
5173PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005174PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005175{
Victor Stinnerb960b342011-11-20 19:12:52 +01005176 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005177}
5178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179/* --- UTF-16 Codec ------------------------------------------------------- */
5180
Tim Peters772747b2001-08-09 22:21:55 +00005181PyObject *
5182PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 Py_ssize_t size,
5184 const char *errors,
5185 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186{
Walter Dörwald69652032004-09-07 20:24:22 +00005187 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5188}
5189
5190PyObject *
5191PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 Py_ssize_t size,
5193 const char *errors,
5194 int *byteorder,
5195 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005196{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005198 Py_ssize_t startinpos;
5199 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005200 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005201 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005202 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005203 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005204 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 PyObject *errorHandler = NULL;
5206 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207
Tim Peters772747b2001-08-09 22:21:55 +00005208 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005209 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210
5211 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005212 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005214 /* Check for BOM marks (U+FEFF) in the input and adjust current
5215 byte order setting accordingly. In native mode, the leading BOM
5216 mark is skipped, in all other modes, it is copied to the output
5217 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005218 if (bo == 0 && size >= 2) {
5219 const Py_UCS4 bom = (q[1] << 8) | q[0];
5220 if (bom == 0xFEFF) {
5221 q += 2;
5222 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005224 else if (bom == 0xFFFE) {
5225 q += 2;
5226 bo = 1;
5227 }
5228 if (byteorder)
5229 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
Antoine Pitrou63065d72012-05-15 23:48:04 +02005232 if (q == e) {
5233 if (consumed)
5234 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005235 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005236 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005237
Christian Heimes743e0cd2012-10-17 23:52:17 +02005238#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005239 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005240#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005241 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005242#endif
Tim Peters772747b2001-08-09 22:21:55 +00005243
Antoine Pitrou63065d72012-05-15 23:48:04 +02005244 /* Note: size will always be longer than the resulting Unicode
5245 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005246 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005247 writer.min_length = (e - q + 1) / 2;
5248 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005249 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005250
Antoine Pitrou63065d72012-05-15 23:48:04 +02005251 while (1) {
5252 Py_UCS4 ch = 0;
5253 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005254 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005255 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005256 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005257 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005258 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005259 native_ordering);
5260 else
5261 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005262 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005263 native_ordering);
5264 } else if (kind == PyUnicode_2BYTE_KIND) {
5265 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005266 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005267 native_ordering);
5268 } else {
5269 assert(kind == PyUnicode_4BYTE_KIND);
5270 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005271 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005272 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005273 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005274 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005275
Antoine Pitrou63065d72012-05-15 23:48:04 +02005276 switch (ch)
5277 {
5278 case 0:
5279 /* remaining byte at the end? (size should be even) */
5280 if (q == e || consumed)
5281 goto End;
5282 errmsg = "truncated data";
5283 startinpos = ((const char *)q) - starts;
5284 endinpos = ((const char *)e) - starts;
5285 break;
5286 /* The remaining input chars are ignored if the callback
5287 chooses to skip the input */
5288 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005289 q -= 2;
5290 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005291 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005292 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005293 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005294 endinpos = ((const char *)e) - starts;
5295 break;
5296 case 2:
5297 errmsg = "illegal encoding";
5298 startinpos = ((const char *)q) - 2 - starts;
5299 endinpos = startinpos + 2;
5300 break;
5301 case 3:
5302 errmsg = "illegal UTF-16 surrogate";
5303 startinpos = ((const char *)q) - 4 - starts;
5304 endinpos = startinpos + 2;
5305 break;
5306 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005307 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005308 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 continue;
5310 }
5311
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005313 errors,
5314 &errorHandler,
5315 "utf16", errmsg,
5316 &starts,
5317 (const char **)&e,
5318 &startinpos,
5319 &endinpos,
5320 &exc,
5321 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005322 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 }
5325
Antoine Pitrou63065d72012-05-15 23:48:04 +02005326End:
Walter Dörwald69652032004-09-07 20:24:22 +00005327 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330 Py_XDECREF(errorHandler);
5331 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005332 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005335 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336 Py_XDECREF(errorHandler);
5337 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 return NULL;
5339}
5340
Tim Peters772747b2001-08-09 22:21:55 +00005341PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005342_PyUnicode_EncodeUTF16(PyObject *str,
5343 const char *errors,
5344 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005346 enum PyUnicode_Kind kind;
5347 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005348 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005349 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005350 unsigned short *out;
5351 Py_ssize_t bytesize;
5352 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005353#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005354 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005355#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005356 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005357#endif
5358
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005359 if (!PyUnicode_Check(str)) {
5360 PyErr_BadArgument();
5361 return NULL;
5362 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005363 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005364 return NULL;
5365 kind = PyUnicode_KIND(str);
5366 data = PyUnicode_DATA(str);
5367 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005368
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005369 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005370 if (kind == PyUnicode_4BYTE_KIND) {
5371 const Py_UCS4 *in = (const Py_UCS4 *)data;
5372 const Py_UCS4 *end = in + len;
5373 while (in < end)
5374 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005375 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005376 }
5377 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005379 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005380 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 if (v == NULL)
5382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005384 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005385 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005386 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005388 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005390 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005391
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005392 switch (kind) {
5393 case PyUnicode_1BYTE_KIND: {
5394 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5395 break;
Tim Peters772747b2001-08-09 22:21:55 +00005396 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005397 case PyUnicode_2BYTE_KIND: {
5398 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5399 break;
Tim Peters772747b2001-08-09 22:21:55 +00005400 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005401 case PyUnicode_4BYTE_KIND: {
5402 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5403 break;
5404 }
5405 default:
5406 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005407 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005408
5409 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005410 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411}
5412
Alexander Belopolsky40018472011-02-26 01:02:56 +00005413PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005414PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5415 Py_ssize_t size,
5416 const char *errors,
5417 int byteorder)
5418{
5419 PyObject *result;
5420 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5421 if (tmp == NULL)
5422 return NULL;
5423 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5424 Py_DECREF(tmp);
5425 return result;
5426}
5427
5428PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005429PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005431 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432}
5433
5434/* --- Unicode Escape Codec ----------------------------------------------- */
5435
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether the
   escaped input decodes to a pure-ASCII string and, if so, how long it is.

   s, size -- the escaped input and its length in bytes.

   Returns the number of characters the decoded string will contain, or -1
   if that cannot be guaranteed to be ASCII:
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte or is followed by a byte > 127,
     - an octal, \x, \u, \U or \N escape is present.
   A backslash followed by a newline contributes no characters; a backslash
   followed by an unrecognised character contributes two (both are kept).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming the end pointer: p + size with a
       negative size would be out-of-bounds pointer arithmetic. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5490
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL;
   PyUnicode_DecodeUnicodeEscape fills it in via PyCapsule_Import() the first
   time it meets a \N{...} escape and then calls its getcode() member. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005491static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005492
Alexander Belopolsky40018472011-02-26 01:02:56 +00005493PyObject *
5494PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005495 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005496 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005499 Py_ssize_t startinpos;
5500 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005501 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005503 char* message;
5504 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 PyObject *errorHandler = NULL;
5506 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005507 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005508
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005509 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005510 if (len == 0)
5511 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005512
5513 /* After length_of_escaped_ascii_string() there are two alternatives,
5514 either the string is pure ASCII with named escapes like \n, etc.
5515 and we determined it's exact size (common case)
5516 or it contains \x, \u, ... escape sequences. then we create a
5517 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005518 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005519 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005520 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521 }
5522 else {
5523 /* Escaped strings will always be longer than the resulting
5524 Unicode string, so we start with size here and then reduce the
5525 length after conversion to the true value.
5526 (but if the error callback returns a long replacement string
5527 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005528 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005529 }
5530
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005532 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005534
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 while (s < end) {
5536 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005537 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539
5540 /* Non-escape characters are interpreted as Unicode ordinals */
5541 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005542 x = (unsigned char)*s;
5543 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005544 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005545 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 continue;
5547 }
5548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005549 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 /* \ - Escapes */
5551 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005552 c = *s++;
5553 if (s > end)
5554 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005555
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005556 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005559#define WRITECHAR(ch) \
5560 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005561 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005562 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005563 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005564
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005566 case '\\': WRITECHAR('\\'); break;
5567 case '\'': WRITECHAR('\''); break;
5568 case '\"': WRITECHAR('\"'); break;
5569 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005571 case 'f': WRITECHAR('\014'); break;
5572 case 't': WRITECHAR('\t'); break;
5573 case 'n': WRITECHAR('\n'); break;
5574 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005575 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005576 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005578 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 case '0': case '1': case '2': case '3':
5582 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005583 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005584 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005585 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005586 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005587 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005589 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 break;
5591
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 /* hex escapes */
5593 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005595 digits = 2;
5596 message = "truncated \\xXX escape";
5597 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005601 digits = 4;
5602 message = "truncated \\uXXXX escape";
5603 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005606 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005607 digits = 8;
5608 message = "truncated \\UXXXXXXXX escape";
5609 hexescape:
5610 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005611 if (end - s < digits) {
5612 /* count only hex digits */
5613 for (; s < end; ++s) {
5614 c = (unsigned char)*s;
5615 if (!Py_ISXDIGIT(c))
5616 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005617 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005618 goto error;
5619 }
5620 for (; digits--; ++s) {
5621 c = (unsigned char)*s;
5622 if (!Py_ISXDIGIT(c))
5623 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005624 chr = (chr<<4) & ~0xF;
5625 if (c >= '0' && c <= '9')
5626 chr += c - '0';
5627 else if (c >= 'a' && c <= 'f')
5628 chr += 10 + c - 'a';
5629 else
5630 chr += 10 + c - 'A';
5631 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005632 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 /* _decoding_error will have already written into the
5634 target buffer. */
5635 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005636 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005637 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005638 message = "illegal Unicode character";
5639 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005640 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005641 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005642 break;
5643
Benjamin Peterson29060642009-01-31 22:14:21 +00005644 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005645 case 'N':
5646 message = "malformed \\N character escape";
5647 if (ucnhash_CAPI == NULL) {
5648 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005649 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5650 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005651 if (ucnhash_CAPI == NULL)
5652 goto ucnhashError;
5653 }
5654 if (*s == '{') {
5655 const char *start = s+1;
5656 /* look for the closing brace */
5657 while (*s != '}' && s < end)
5658 s++;
5659 if (s > start && s < end && *s == '}') {
5660 /* found a name. look it up in the unicode database */
5661 message = "unknown Unicode character name";
5662 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005663 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005664 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005665 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005666 goto store;
5667 }
5668 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005669 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005670
5671 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005672 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673 message = "\\ at end of string";
5674 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005675 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005676 }
5677 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005678 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005679 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005680 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005681 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005683 continue;
5684
5685 error:
5686 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005687 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005688 errors, &errorHandler,
5689 "unicodeescape", message,
5690 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005691 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005692 goto onError;
5693 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005697 Py_XDECREF(errorHandler);
5698 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005702 PyErr_SetString(
5703 PyExc_UnicodeError,
5704 "\\N escapes not supported (can't load unicodedata module)"
5705 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005706 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 Py_XDECREF(errorHandler);
5708 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005709 return NULL;
5710
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005712 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 Py_XDECREF(errorHandler);
5714 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 return NULL;
5716}
5717
5718/* Return a Unicode-Escape string version of the Unicode object.
5719
5720 If quotes is true, the string is enclosed in u"" or u'' quotes as
5721 appropriate.
5722
5723*/
5724
Alexander Belopolsky40018472011-02-26 01:02:56 +00005725PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005726PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005728 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005729 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005731 int kind;
5732 void *data;
5733 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
Ezio Melottie7f90372012-10-05 03:33:31 +03005735 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005736 escape.
5737
Ezio Melottie7f90372012-10-05 03:33:31 +03005738 For UCS1 strings it's '\xxx', 4 bytes per source character.
5739 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5740 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005741 */
5742
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005743 if (!PyUnicode_Check(unicode)) {
5744 PyErr_BadArgument();
5745 return NULL;
5746 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005747 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748 return NULL;
5749 len = PyUnicode_GET_LENGTH(unicode);
5750 kind = PyUnicode_KIND(unicode);
5751 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005752 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005753 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5754 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5755 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5756 }
5757
5758 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005759 return PyBytes_FromStringAndSize(NULL, 0);
5760
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005761 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005763
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005764 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005766 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 if (repr == NULL)
5769 return NULL;
5770
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005771 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005773 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005774 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005775
Walter Dörwald79e913e2007-05-12 11:08:06 +00005776 /* Escape backslashes */
5777 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 *p++ = '\\';
5779 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005780 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005781 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005782
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005783 /* Map 21-bit characters to '\U00xxxxxx' */
5784 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005785 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005786 *p++ = '\\';
5787 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005788 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5789 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5790 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5791 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5792 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5793 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5794 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5795 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005797 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005798
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005800 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 *p++ = '\\';
5802 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005803 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5804 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5805 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5806 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005808
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005809 /* Map special whitespace to '\t', \n', '\r' */
5810 else if (ch == '\t') {
5811 *p++ = '\\';
5812 *p++ = 't';
5813 }
5814 else if (ch == '\n') {
5815 *p++ = '\\';
5816 *p++ = 'n';
5817 }
5818 else if (ch == '\r') {
5819 *p++ = '\\';
5820 *p++ = 'r';
5821 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005822
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005823 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005824 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005826 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005827 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5828 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005829 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005830
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 /* Copy everything else as-is */
5832 else
5833 *p++ = (char) ch;
5834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005836 assert(p - PyBytes_AS_STRING(repr) > 0);
5837 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5838 return NULL;
5839 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840}
5841
Alexander Belopolsky40018472011-02-26 01:02:56 +00005842PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005843PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5844 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005846 PyObject *result;
5847 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5848 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005850 result = PyUnicode_AsUnicodeEscapeString(tmp);
5851 Py_DECREF(tmp);
5852 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853}
5854
5855/* --- Raw Unicode Escape Codec ------------------------------------------- */
5856
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyObject *
5858PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005859 Py_ssize_t size,
5860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 Py_ssize_t startinpos;
5864 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005865 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 const char *end;
5867 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 PyObject *errorHandler = NULL;
5869 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005870
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005871 if (size == 0)
5872 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005873
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 /* Escaped strings will always be longer than the resulting
5875 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 length after conversion to the true value. (But decoding error
5877 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005878 _PyUnicodeWriter_Init(&writer);
5879 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005880
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 end = s + size;
5882 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 unsigned char c;
5884 Py_UCS4 x;
5885 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005886 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 /* Non-escape characters are interpreted as Unicode ordinals */
5889 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005890 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005891 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005894 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 startinpos = s-starts;
5896
5897 /* \u-escapes are only interpreted iff the number of leading
5898 backslashes if odd */
5899 bs = s;
5900 for (;s < end;) {
5901 if (*s != '\\')
5902 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005903 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005904 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005905 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 }
5907 if (((s - bs) & 1) == 0 ||
5908 s >= end ||
5909 (*s != 'u' && *s != 'U')) {
5910 continue;
5911 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005912 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 count = *s=='u' ? 4 : 8;
5914 s++;
5915
5916 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 for (x = 0, i = 0; i < count; ++i, ++s) {
5918 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005919 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005920 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005921 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 errors, &errorHandler,
5923 "rawunicodeescape", "truncated \\uXXXX",
5924 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005925 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 goto onError;
5927 goto nextByte;
5928 }
5929 x = (x<<4) & ~0xF;
5930 if (c >= '0' && c <= '9')
5931 x += c - '0';
5932 else if (c >= 'a' && c <= 'f')
5933 x += 10 + c - 'a';
5934 else
5935 x += 10 + c - 'A';
5936 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005937 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005938 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005939 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005940 }
5941 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005942 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005943 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005944 errors, &errorHandler,
5945 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005947 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005949 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 nextByte:
5951 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005953 Py_XDECREF(errorHandler);
5954 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005955 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005956
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005958 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 Py_XDECREF(errorHandler);
5960 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 return NULL;
5962}
5963
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005964
Alexander Belopolsky40018472011-02-26 01:02:56 +00005965PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005966PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005968 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 char *p;
5970 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005971 Py_ssize_t expandsize, pos;
5972 int kind;
5973 void *data;
5974 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005976 if (!PyUnicode_Check(unicode)) {
5977 PyErr_BadArgument();
5978 return NULL;
5979 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005980 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005981 return NULL;
5982 kind = PyUnicode_KIND(unicode);
5983 data = PyUnicode_DATA(unicode);
5984 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005985 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5986 bytes, and 1 byte characters 4. */
5987 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005988
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005989 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005991
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005992 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 if (repr == NULL)
5994 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005995 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005996 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005998 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005999 for (pos = 0; pos < len; pos++) {
6000 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 /* Map 32-bit characters to '\Uxxxxxxxx' */
6002 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006003 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006004 *p++ = '\\';
6005 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006006 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6007 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6008 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6009 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6010 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6011 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6012 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6013 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006014 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006016 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 *p++ = '\\';
6018 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006019 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6020 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6021 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6022 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 /* Copy everything else as-is */
6025 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 *p++ = (char) ch;
6027 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006028
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006029 assert(p > q);
6030 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006031 return NULL;
6032 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033}
6034
Alexander Belopolsky40018472011-02-26 01:02:56 +00006035PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006036PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6037 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006039 PyObject *result;
6040 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6041 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006042 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006043 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6044 Py_DECREF(tmp);
6045 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046}
6047
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006048/* --- Unicode Internal Codec ------------------------------------------- */
6049
Alexander Belopolsky40018472011-02-26 01:02:56 +00006050PyObject *
6051_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006052 Py_ssize_t size,
6053 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006054{
6055 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006056 Py_ssize_t startinpos;
6057 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006058 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006059 const char *end;
6060 const char *reason;
6061 PyObject *errorHandler = NULL;
6062 PyObject *exc = NULL;
6063
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006064 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006065 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006066 1))
6067 return NULL;
6068
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006069 if (size == 0)
6070 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006071
Victor Stinner8f674cc2013-04-17 23:02:17 +02006072 _PyUnicodeWriter_Init(&writer);
6073 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6074 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006076 }
6077 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006078
Victor Stinner8f674cc2013-04-17 23:02:17 +02006079 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006080 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006081 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006082 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006083 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006084 endinpos = end-starts;
6085 reason = "truncated input";
6086 goto error;
6087 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006088 /* We copy the raw representation one byte at a time because the
6089 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006090 ((char *) &uch)[0] = s[0];
6091 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006092#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006093 ((char *) &uch)[2] = s[2];
6094 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006095#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006096 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006097#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006098 /* We have to sanity check the raw data, otherwise doom looms for
6099 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006100 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006101 endinpos = s - starts + Py_UNICODE_SIZE;
6102 reason = "illegal code point (> 0x10FFFF)";
6103 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006104 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006105#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006106 s += Py_UNICODE_SIZE;
6107#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006108 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006109 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006110 Py_UNICODE uch2;
6111 ((char *) &uch2)[0] = s[0];
6112 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006113 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006114 {
Victor Stinner551ac952011-11-29 22:58:13 +01006115 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006116 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006117 }
6118 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006119#endif
6120
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006121 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006122 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006123 continue;
6124
6125 error:
6126 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006127 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006128 errors, &errorHandler,
6129 "unicode_internal", reason,
6130 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006131 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006132 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006133 }
6134
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006135 Py_XDECREF(errorHandler);
6136 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006137 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006138
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006140 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
6143 return NULL;
6144}
6145
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146/* --- Latin-1 Codec ------------------------------------------------------ */
6147
Alexander Belopolsky40018472011-02-26 01:02:56 +00006148PyObject *
6149PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006150 Py_ssize_t size,
6151 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006154 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155}
6156
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006157/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006158static void
6159make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006160 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006161 PyObject *unicode,
6162 Py_ssize_t startpos, Py_ssize_t endpos,
6163 const char *reason)
6164{
6165 if (*exceptionObject == NULL) {
6166 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006167 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006168 encoding, unicode, startpos, endpos, reason);
6169 }
6170 else {
6171 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6172 goto onError;
6173 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6174 goto onError;
6175 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6176 goto onError;
6177 return;
6178 onError:
6179 Py_DECREF(*exceptionObject);
6180 *exceptionObject = NULL;
6181 }
6182}
6183
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006184/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006185static void
6186raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006187 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006188 PyObject *unicode,
6189 Py_ssize_t startpos, Py_ssize_t endpos,
6190 const char *reason)
6191{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006192 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006193 encoding, unicode, startpos, endpos, reason);
6194 if (*exceptionObject != NULL)
6195 PyCodec_StrictErrors(*exceptionObject);
6196}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197
6198/* error handling callback helper:
6199 build arguments, call the callback and check the arguments,
6200 put the result into newpos and return the replacement string, which
6201 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006202static PyObject *
6203unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006204 PyObject **errorHandler,
6205 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006206 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006207 Py_ssize_t startpos, Py_ssize_t endpos,
6208 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006210 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006211 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006212 PyObject *restuple;
6213 PyObject *resunicode;
6214
6215 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006219 }
6220
Benjamin Petersonbac79492012-01-14 13:34:47 -05006221 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006222 return NULL;
6223 len = PyUnicode_GET_LENGTH(unicode);
6224
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006225 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006226 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006227 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006229
6230 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006232 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006234 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006235 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 Py_DECREF(restuple);
6237 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006239 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 &resunicode, newpos)) {
6241 Py_DECREF(restuple);
6242 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006243 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006244 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6245 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6246 Py_DECREF(restuple);
6247 return NULL;
6248 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006249 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006250 *newpos = len + *newpos;
6251 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6253 Py_DECREF(restuple);
6254 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256 Py_INCREF(resunicode);
6257 Py_DECREF(restuple);
6258 return resunicode;
6259}
6260
Alexander Belopolsky40018472011-02-26 01:02:56 +00006261static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006262unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006263 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006264 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006265{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006266 /* input state */
6267 Py_ssize_t pos=0, size;
6268 int kind;
6269 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006270 /* output object */
6271 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 /* pointer into the output */
6273 char *str;
6274 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006275 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006276 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6277 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006278 PyObject *errorHandler = NULL;
6279 PyObject *exc = NULL;
6280 /* the following variable is used for caching string comparisons
6281 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6282 int known_errorHandler = -1;
6283
Benjamin Petersonbac79492012-01-14 13:34:47 -05006284 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006285 return NULL;
6286 size = PyUnicode_GET_LENGTH(unicode);
6287 kind = PyUnicode_KIND(unicode);
6288 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 /* allocate enough for a simple encoding without
6290 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006291 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006292 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006293 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006295 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006296 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006297 ressize = size;
6298
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006299 while (pos < size) {
6300 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 /* can we encode this? */
6303 if (c<limit) {
6304 /* no overflow check, because we know that the space is enough */
6305 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006306 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006307 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 Py_ssize_t requiredsize;
6310 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006311 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006313 Py_ssize_t collstart = pos;
6314 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006316 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 ++collend;
6318 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6319 if (known_errorHandler==-1) {
6320 if ((errors==NULL) || (!strcmp(errors, "strict")))
6321 known_errorHandler = 1;
6322 else if (!strcmp(errors, "replace"))
6323 known_errorHandler = 2;
6324 else if (!strcmp(errors, "ignore"))
6325 known_errorHandler = 3;
6326 else if (!strcmp(errors, "xmlcharrefreplace"))
6327 known_errorHandler = 4;
6328 else
6329 known_errorHandler = 0;
6330 }
6331 switch (known_errorHandler) {
6332 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006333 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 goto onError;
6335 case 2: /* replace */
6336 while (collstart++<collend)
6337 *str++ = '?'; /* fall through */
6338 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006339 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 break;
6341 case 4: /* xmlcharrefreplace */
6342 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006343 /* determine replacement size */
6344 for (i = collstart, repsize = 0; i < collend; ++i) {
6345 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6346 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006348 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006350 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006352 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006354 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006356 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006358 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006359 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006361 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006363 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 if (requiredsize > ressize) {
6365 if (requiredsize<2*ressize)
6366 requiredsize = 2*ressize;
6367 if (_PyBytes_Resize(&res, requiredsize))
6368 goto onError;
6369 str = PyBytes_AS_STRING(res) + respos;
6370 ressize = requiredsize;
6371 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006372 /* generate replacement */
6373 for (i = collstart; i < collend; ++i) {
6374 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006376 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 break;
6378 default:
6379 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006380 encoding, reason, unicode, &exc,
6381 collstart, collend, &newpos);
6382 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006383 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006385 if (PyBytes_Check(repunicode)) {
6386 /* Directly copy bytes result to output. */
6387 repsize = PyBytes_Size(repunicode);
6388 if (repsize > 1) {
6389 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006390 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006391 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6392 Py_DECREF(repunicode);
6393 goto onError;
6394 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006395 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006396 ressize += repsize-1;
6397 }
6398 memcpy(str, PyBytes_AsString(repunicode), repsize);
6399 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006400 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006401 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006402 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006403 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 /* need more space? (at least enough for what we
6405 have+the replacement+the rest of the string, so
6406 we won't have to check space for encodable characters) */
6407 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 repsize = PyUnicode_GET_LENGTH(repunicode);
6409 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 if (requiredsize > ressize) {
6411 if (requiredsize<2*ressize)
6412 requiredsize = 2*ressize;
6413 if (_PyBytes_Resize(&res, requiredsize)) {
6414 Py_DECREF(repunicode);
6415 goto onError;
6416 }
6417 str = PyBytes_AS_STRING(res) + respos;
6418 ressize = requiredsize;
6419 }
6420 /* check if there is anything unencodable in the replacement
6421 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006422 for (i = 0; repsize-->0; ++i, ++str) {
6423 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006425 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006426 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 Py_DECREF(repunicode);
6428 goto onError;
6429 }
6430 *str = (char)c;
6431 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006432 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006433 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006434 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006435 }
6436 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006437 /* Resize if we allocated to much */
6438 size = str - PyBytes_AS_STRING(res);
6439 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006440 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006441 if (_PyBytes_Resize(&res, size) < 0)
6442 goto onError;
6443 }
6444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445 Py_XDECREF(errorHandler);
6446 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006447 return res;
6448
6449 onError:
6450 Py_XDECREF(res);
6451 Py_XDECREF(errorHandler);
6452 Py_XDECREF(exc);
6453 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454}
6455
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006456/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006457PyObject *
6458PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006459 Py_ssize_t size,
6460 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006462 PyObject *result;
6463 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6464 if (unicode == NULL)
6465 return NULL;
6466 result = unicode_encode_ucs1(unicode, errors, 256);
6467 Py_DECREF(unicode);
6468 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469}
6470
Alexander Belopolsky40018472011-02-26 01:02:56 +00006471PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006472_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473{
6474 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 PyErr_BadArgument();
6476 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006478 if (PyUnicode_READY(unicode) == -1)
6479 return NULL;
6480 /* Fast path: if it is a one-byte string, construct
6481 bytes object directly. */
6482 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6483 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6484 PyUnicode_GET_LENGTH(unicode));
6485 /* Non-Latin-1 characters present. Defer to above function to
6486 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006487 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006488}
6489
6490PyObject*
6491PyUnicode_AsLatin1String(PyObject *unicode)
6492{
6493 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494}
6495
6496/* --- 7-bit ASCII Codec -------------------------------------------------- */
6497
Alexander Belopolsky40018472011-02-26 01:02:56 +00006498PyObject *
6499PyUnicode_DecodeASCII(const char *s,
6500 Py_ssize_t size,
6501 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006503 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006504 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006505 int kind;
6506 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006507 Py_ssize_t startinpos;
6508 Py_ssize_t endinpos;
6509 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006510 const char *e;
6511 PyObject *errorHandler = NULL;
6512 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006513
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006515 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006516
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006518 if (size == 1 && (unsigned char)s[0] < 128)
6519 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006520
Victor Stinner8f674cc2013-04-17 23:02:17 +02006521 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006522 writer.min_length = size;
6523 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006524 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006527 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006528 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006529 writer.pos = outpos;
6530 if (writer.pos == size)
6531 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006532
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006533 s += writer.pos;
6534 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006535 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006536 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006538 PyUnicode_WRITE(kind, data, writer.pos, c);
6539 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 ++s;
6541 }
6542 else {
6543 startinpos = s-starts;
6544 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006545 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 errors, &errorHandler,
6547 "ascii", "ordinal not in range(128)",
6548 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006549 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006551 kind = writer.kind;
6552 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006555 Py_XDECREF(errorHandler);
6556 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006557 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006558
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006560 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006561 Py_XDECREF(errorHandler);
6562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 return NULL;
6564}
6565
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006566/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006567PyObject *
6568PyUnicode_EncodeASCII(const Py_UNICODE *p,
6569 Py_ssize_t size,
6570 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006572 PyObject *result;
6573 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6574 if (unicode == NULL)
6575 return NULL;
6576 result = unicode_encode_ucs1(unicode, errors, 128);
6577 Py_DECREF(unicode);
6578 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579}
6580
Alexander Belopolsky40018472011-02-26 01:02:56 +00006581PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006582_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
6584 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 PyErr_BadArgument();
6586 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006588 if (PyUnicode_READY(unicode) == -1)
6589 return NULL;
6590 /* Fast path: if it is an ASCII-only string, construct bytes object
6591 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006592 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006593 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6594 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006596}
6597
6598PyObject *
6599PyUnicode_AsASCIIString(PyObject *unicode)
6600{
6601 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602}
6603
Victor Stinner99b95382011-07-04 14:23:54 +02006604#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006605
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006606/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006607
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006608#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006609#define NEED_RETRY
6610#endif
6611
Victor Stinner3a50e702011-10-18 21:21:00 +02006612#ifndef WC_ERR_INVALID_CHARS
6613# define WC_ERR_INVALID_CHARS 0x0080
6614#endif
6615
6616static char*
6617code_page_name(UINT code_page, PyObject **obj)
6618{
6619 *obj = NULL;
6620 if (code_page == CP_ACP)
6621 return "mbcs";
6622 if (code_page == CP_UTF7)
6623 return "CP_UTF7";
6624 if (code_page == CP_UTF8)
6625 return "CP_UTF8";
6626
6627 *obj = PyBytes_FromFormat("cp%u", code_page);
6628 if (*obj == NULL)
6629 return NULL;
6630 return PyBytes_AS_STRING(*obj);
6631}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006632
Alexander Belopolsky40018472011-02-26 01:02:56 +00006633static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006634is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006635{
6636 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006637 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006638
Victor Stinner3a50e702011-10-18 21:21:00 +02006639 if (!IsDBCSLeadByteEx(code_page, *curr))
6640 return 0;
6641
6642 prev = CharPrevExA(code_page, s, curr, 0);
6643 if (prev == curr)
6644 return 1;
6645 /* FIXME: This code is limited to "true" double-byte encodings,
6646 as it assumes an incomplete character consists of a single
6647 byte. */
6648 if (curr - prev == 2)
6649 return 1;
6650 if (!IsDBCSLeadByteEx(code_page, *prev))
6651 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006652 return 0;
6653}
6654
Victor Stinner3a50e702011-10-18 21:21:00 +02006655static DWORD
6656decode_code_page_flags(UINT code_page)
6657{
6658 if (code_page == CP_UTF7) {
6659 /* The CP_UTF7 decoder only supports flags=0 */
6660 return 0;
6661 }
6662 else
6663 return MB_ERR_INVALID_CHARS;
6664}
6665
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006666/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006667 * Decode a byte string from a Windows code page into unicode object in strict
6668 * mode.
6669 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006670 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6671 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006672 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006673static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006674decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006675 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006676 const char *in,
6677 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006678{
Victor Stinner3a50e702011-10-18 21:21:00 +02006679 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006680 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006681 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006682
6683 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006684 assert(insize > 0);
6685 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6686 if (outsize <= 0)
6687 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006688
6689 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006691 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006692 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 if (*v == NULL)
6694 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006695 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006696 }
6697 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006699 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006700 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006702 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006703 }
6704
6705 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006706 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6707 if (outsize <= 0)
6708 goto error;
6709 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006710
Victor Stinner3a50e702011-10-18 21:21:00 +02006711error:
6712 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6713 return -2;
6714 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006715 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006716}
6717
Victor Stinner3a50e702011-10-18 21:21:00 +02006718/*
6719 * Decode a byte string from a code page into unicode object with an error
6720 * handler.
6721 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006722 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006723 * UnicodeDecodeError exception and returns -1 on error.
6724 */
6725static int
6726decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006727 PyObject **v,
6728 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006729 const char *errors)
6730{
6731 const char *startin = in;
6732 const char *endin = in + size;
6733 const DWORD flags = decode_code_page_flags(code_page);
6734 /* Ideally, we should get reason from FormatMessage. This is the Windows
6735 2000 English version of the message. */
6736 const char *reason = "No mapping for the Unicode character exists "
6737 "in the target code page.";
6738 /* each step cannot decode more than 1 character, but a character can be
6739 represented as a surrogate pair */
6740 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006741 int insize;
6742 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006743 PyObject *errorHandler = NULL;
6744 PyObject *exc = NULL;
6745 PyObject *encoding_obj = NULL;
6746 char *encoding;
6747 DWORD err;
6748 int ret = -1;
6749
6750 assert(size > 0);
6751
6752 encoding = code_page_name(code_page, &encoding_obj);
6753 if (encoding == NULL)
6754 return -1;
6755
6756 if (errors == NULL || strcmp(errors, "strict") == 0) {
6757 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6758 UnicodeDecodeError. */
6759 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6760 if (exc != NULL) {
6761 PyCodec_StrictErrors(exc);
6762 Py_CLEAR(exc);
6763 }
6764 goto error;
6765 }
6766
6767 if (*v == NULL) {
6768 /* Create unicode object */
6769 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6770 PyErr_NoMemory();
6771 goto error;
6772 }
Victor Stinnerab595942011-12-17 04:59:06 +01006773 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006774 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006775 if (*v == NULL)
6776 goto error;
6777 startout = PyUnicode_AS_UNICODE(*v);
6778 }
6779 else {
6780 /* Extend unicode object */
6781 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6782 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6783 PyErr_NoMemory();
6784 goto error;
6785 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006786 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006787 goto error;
6788 startout = PyUnicode_AS_UNICODE(*v) + n;
6789 }
6790
6791 /* Decode the byte string character per character */
6792 out = startout;
6793 while (in < endin)
6794 {
6795 /* Decode a character */
6796 insize = 1;
6797 do
6798 {
6799 outsize = MultiByteToWideChar(code_page, flags,
6800 in, insize,
6801 buffer, Py_ARRAY_LENGTH(buffer));
6802 if (outsize > 0)
6803 break;
6804 err = GetLastError();
6805 if (err != ERROR_NO_UNICODE_TRANSLATION
6806 && err != ERROR_INSUFFICIENT_BUFFER)
6807 {
6808 PyErr_SetFromWindowsErr(0);
6809 goto error;
6810 }
6811 insize++;
6812 }
6813 /* 4=maximum length of a UTF-8 sequence */
6814 while (insize <= 4 && (in + insize) <= endin);
6815
6816 if (outsize <= 0) {
6817 Py_ssize_t startinpos, endinpos, outpos;
6818
6819 startinpos = in - startin;
6820 endinpos = startinpos + 1;
6821 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006822 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006823 errors, &errorHandler,
6824 encoding, reason,
6825 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006826 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006827 {
6828 goto error;
6829 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006830 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006831 }
6832 else {
6833 in += insize;
6834 memcpy(out, buffer, outsize * sizeof(wchar_t));
6835 out += outsize;
6836 }
6837 }
6838
6839 /* write a NUL character at the end */
6840 *out = 0;
6841
6842 /* Extend unicode object */
6843 outsize = out - startout;
6844 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006845 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006847 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006848
6849error:
6850 Py_XDECREF(encoding_obj);
6851 Py_XDECREF(errorHandler);
6852 Py_XDECREF(exc);
6853 return ret;
6854}
6855
Victor Stinner3a50e702011-10-18 21:21:00 +02006856static PyObject *
6857decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006858 const char *s, Py_ssize_t size,
6859 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006860{
Victor Stinner76a31a62011-11-04 00:05:13 +01006861 PyObject *v = NULL;
6862 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006863
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 if (code_page < 0) {
6865 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6866 return NULL;
6867 }
6868
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871
Victor Stinner76a31a62011-11-04 00:05:13 +01006872 do
6873 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006874#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006875 if (size > INT_MAX) {
6876 chunk_size = INT_MAX;
6877 final = 0;
6878 done = 0;
6879 }
6880 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006882 {
6883 chunk_size = (int)size;
6884 final = (consumed == NULL);
6885 done = 1;
6886 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887
Victor Stinner76a31a62011-11-04 00:05:13 +01006888 /* Skip trailing lead-byte unless 'final' is set */
6889 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6890 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006891
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 if (chunk_size == 0 && done) {
6893 if (v != NULL)
6894 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006895 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006896 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897
Victor Stinner76a31a62011-11-04 00:05:13 +01006898
6899 converted = decode_code_page_strict(code_page, &v,
6900 s, chunk_size);
6901 if (converted == -2)
6902 converted = decode_code_page_errors(code_page, &v,
6903 s, chunk_size,
6904 errors);
6905 assert(converted != 0);
6906
6907 if (converted < 0) {
6908 Py_XDECREF(v);
6909 return NULL;
6910 }
6911
6912 if (consumed)
6913 *consumed += converted;
6914
6915 s += converted;
6916 size -= converted;
6917 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006918
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006919 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006920}
6921
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006923PyUnicode_DecodeCodePageStateful(int code_page,
6924 const char *s,
6925 Py_ssize_t size,
6926 const char *errors,
6927 Py_ssize_t *consumed)
6928{
6929 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6930}
6931
6932PyObject *
6933PyUnicode_DecodeMBCSStateful(const char *s,
6934 Py_ssize_t size,
6935 const char *errors,
6936 Py_ssize_t *consumed)
6937{
6938 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6939}
6940
6941PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006942PyUnicode_DecodeMBCS(const char *s,
6943 Py_ssize_t size,
6944 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006945{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006946 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6947}
6948
Victor Stinner3a50e702011-10-18 21:21:00 +02006949static DWORD
6950encode_code_page_flags(UINT code_page, const char *errors)
6951{
6952 if (code_page == CP_UTF8) {
6953 if (winver.dwMajorVersion >= 6)
6954 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6955 and later */
6956 return WC_ERR_INVALID_CHARS;
6957 else
6958 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6959 return 0;
6960 }
6961 else if (code_page == CP_UTF7) {
6962 /* CP_UTF7 only supports flags=0 */
6963 return 0;
6964 }
6965 else {
6966 if (errors != NULL && strcmp(errors, "replace") == 0)
6967 return 0;
6968 else
6969 return WC_NO_BEST_FIT_CHARS;
6970 }
6971}
6972
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006973/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006974 * Encode a Unicode string to a Windows code page into a byte string in strict
6975 * mode.
6976 *
6977 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006978 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006980static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006981encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006982 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006983 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006984{
Victor Stinner554f3f02010-06-16 23:33:54 +00006985 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006986 BOOL *pusedDefaultChar = &usedDefaultChar;
6987 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006988 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006989 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006990 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006991 const DWORD flags = encode_code_page_flags(code_page, NULL);
6992 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006993 /* Create a substring so that we can get the UTF-16 representation
6994 of just the slice under consideration. */
6995 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996
Martin v. Löwis3d325192011-11-04 18:23:06 +01006997 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006998
Victor Stinner3a50e702011-10-18 21:21:00 +02006999 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007000 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007001 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007002 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007003
Victor Stinner2fc507f2011-11-04 20:06:39 +01007004 substring = PyUnicode_Substring(unicode, offset, offset+len);
7005 if (substring == NULL)
7006 return -1;
7007 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7008 if (p == NULL) {
7009 Py_DECREF(substring);
7010 return -1;
7011 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007012 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007013
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007014 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007016 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 NULL, 0,
7018 NULL, pusedDefaultChar);
7019 if (outsize <= 0)
7020 goto error;
7021 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007022 if (pusedDefaultChar && *pusedDefaultChar) {
7023 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007024 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007025 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007026
Victor Stinner3a50e702011-10-18 21:21:00 +02007027 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007029 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007030 if (*outbytes == NULL) {
7031 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007033 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007034 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035 }
7036 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007038 const Py_ssize_t n = PyBytes_Size(*outbytes);
7039 if (outsize > PY_SSIZE_T_MAX - n) {
7040 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007041 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007044 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7045 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007046 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007047 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007048 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007049 }
7050
7051 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007052 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007053 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007054 out, outsize,
7055 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007056 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007057 if (outsize <= 0)
7058 goto error;
7059 if (pusedDefaultChar && *pusedDefaultChar)
7060 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007061 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007062
Victor Stinner3a50e702011-10-18 21:21:00 +02007063error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007064 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7066 return -2;
7067 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007068 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007069}
7070
Victor Stinner3a50e702011-10-18 21:21:00 +02007071/*
7072 * Encode a Unicode string to a Windows code page into a byte string using a
7073 * error handler.
7074 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007075 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007076 * -1 on other error.
7077 */
7078static int
7079encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007080 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007081 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007082{
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007084 Py_ssize_t pos = unicode_offset;
7085 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 /* Ideally, we should get reason from FormatMessage. This is the Windows
7087 2000 English version of the message. */
7088 const char *reason = "invalid character";
7089 /* 4=maximum length of a UTF-8 sequence */
7090 char buffer[4];
7091 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7092 Py_ssize_t outsize;
7093 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 PyObject *errorHandler = NULL;
7095 PyObject *exc = NULL;
7096 PyObject *encoding_obj = NULL;
7097 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007098 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 PyObject *rep;
7100 int ret = -1;
7101
7102 assert(insize > 0);
7103
7104 encoding = code_page_name(code_page, &encoding_obj);
7105 if (encoding == NULL)
7106 return -1;
7107
7108 if (errors == NULL || strcmp(errors, "strict") == 0) {
7109 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7110 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007111 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 if (exc != NULL) {
7113 PyCodec_StrictErrors(exc);
7114 Py_DECREF(exc);
7115 }
7116 Py_XDECREF(encoding_obj);
7117 return -1;
7118 }
7119
7120 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7121 pusedDefaultChar = &usedDefaultChar;
7122 else
7123 pusedDefaultChar = NULL;
7124
7125 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7126 PyErr_NoMemory();
7127 goto error;
7128 }
7129 outsize = insize * Py_ARRAY_LENGTH(buffer);
7130
7131 if (*outbytes == NULL) {
7132 /* Create string object */
7133 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7134 if (*outbytes == NULL)
7135 goto error;
7136 out = PyBytes_AS_STRING(*outbytes);
7137 }
7138 else {
7139 /* Extend string object */
7140 Py_ssize_t n = PyBytes_Size(*outbytes);
7141 if (n > PY_SSIZE_T_MAX - outsize) {
7142 PyErr_NoMemory();
7143 goto error;
7144 }
7145 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7146 goto error;
7147 out = PyBytes_AS_STRING(*outbytes) + n;
7148 }
7149
7150 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007151 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007152 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007153 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7154 wchar_t chars[2];
7155 int charsize;
7156 if (ch < 0x10000) {
7157 chars[0] = (wchar_t)ch;
7158 charsize = 1;
7159 }
7160 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007161 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7162 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007163 charsize = 2;
7164 }
7165
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007167 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 buffer, Py_ARRAY_LENGTH(buffer),
7169 NULL, pusedDefaultChar);
7170 if (outsize > 0) {
7171 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7172 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007173 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 memcpy(out, buffer, outsize);
7175 out += outsize;
7176 continue;
7177 }
7178 }
7179 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7180 PyErr_SetFromWindowsErr(0);
7181 goto error;
7182 }
7183
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 rep = unicode_encode_call_errorhandler(
7185 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007186 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007187 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 if (rep == NULL)
7189 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007190 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007191
7192 if (PyBytes_Check(rep)) {
7193 outsize = PyBytes_GET_SIZE(rep);
7194 if (outsize != 1) {
7195 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7196 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7197 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7198 Py_DECREF(rep);
7199 goto error;
7200 }
7201 out = PyBytes_AS_STRING(*outbytes) + offset;
7202 }
7203 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7204 out += outsize;
7205 }
7206 else {
7207 Py_ssize_t i;
7208 enum PyUnicode_Kind kind;
7209 void *data;
7210
Benjamin Petersonbac79492012-01-14 13:34:47 -05007211 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 Py_DECREF(rep);
7213 goto error;
7214 }
7215
7216 outsize = PyUnicode_GET_LENGTH(rep);
7217 if (outsize != 1) {
7218 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7219 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7220 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7221 Py_DECREF(rep);
7222 goto error;
7223 }
7224 out = PyBytes_AS_STRING(*outbytes) + offset;
7225 }
7226 kind = PyUnicode_KIND(rep);
7227 data = PyUnicode_DATA(rep);
7228 for (i=0; i < outsize; i++) {
7229 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7230 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007231 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007232 encoding, unicode,
7233 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 "unable to encode error handler result to ASCII");
7235 Py_DECREF(rep);
7236 goto error;
7237 }
7238 *out = (unsigned char)ch;
7239 out++;
7240 }
7241 }
7242 Py_DECREF(rep);
7243 }
7244 /* write a NUL byte */
7245 *out = 0;
7246 outsize = out - PyBytes_AS_STRING(*outbytes);
7247 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7248 if (_PyBytes_Resize(outbytes, outsize) < 0)
7249 goto error;
7250 ret = 0;
7251
7252error:
7253 Py_XDECREF(encoding_obj);
7254 Py_XDECREF(errorHandler);
7255 Py_XDECREF(exc);
7256 return ret;
7257}
7258
Victor Stinner3a50e702011-10-18 21:21:00 +02007259static PyObject *
7260encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007261 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 const char *errors)
7263{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007264 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007266 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007267 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007268
Benjamin Petersonbac79492012-01-14 13:34:47 -05007269 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007270 return NULL;
7271 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007272
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 if (code_page < 0) {
7274 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7275 return NULL;
7276 }
7277
Martin v. Löwis3d325192011-11-04 18:23:06 +01007278 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007279 return PyBytes_FromStringAndSize(NULL, 0);
7280
Victor Stinner7581cef2011-11-03 22:32:33 +01007281 offset = 0;
7282 do
7283 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007284#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007285 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007286 chunks. */
7287 if (len > INT_MAX/2) {
7288 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007289 done = 0;
7290 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007291 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007292#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007293 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007294 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007295 done = 1;
7296 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007297
Victor Stinner76a31a62011-11-04 00:05:13 +01007298 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007299 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007300 errors);
7301 if (ret == -2)
7302 ret = encode_code_page_errors(code_page, &outbytes,
7303 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007304 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007305 if (ret < 0) {
7306 Py_XDECREF(outbytes);
7307 return NULL;
7308 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309
Victor Stinner7581cef2011-11-03 22:32:33 +01007310 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007311 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007312 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 return outbytes;
7315}
7316
7317PyObject *
7318PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7319 Py_ssize_t size,
7320 const char *errors)
7321{
Victor Stinner7581cef2011-11-03 22:32:33 +01007322 PyObject *unicode, *res;
7323 unicode = PyUnicode_FromUnicode(p, size);
7324 if (unicode == NULL)
7325 return NULL;
7326 res = encode_code_page(CP_ACP, unicode, errors);
7327 Py_DECREF(unicode);
7328 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007329}
7330
7331PyObject *
7332PyUnicode_EncodeCodePage(int code_page,
7333 PyObject *unicode,
7334 const char *errors)
7335{
Victor Stinner7581cef2011-11-03 22:32:33 +01007336 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007337}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007338
Alexander Belopolsky40018472011-02-26 01:02:56 +00007339PyObject *
7340PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007341{
7342 if (!PyUnicode_Check(unicode)) {
7343 PyErr_BadArgument();
7344 return NULL;
7345 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007346 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007347}
7348
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349#undef NEED_RETRY
7350
Victor Stinner99b95382011-07-04 14:23:54 +02007351#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007352
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353/* --- Character Mapping Codec -------------------------------------------- */
7354
Victor Stinnerfb161b12013-04-18 01:44:27 +02007355static int
7356charmap_decode_string(const char *s,
7357 Py_ssize_t size,
7358 PyObject *mapping,
7359 const char *errors,
7360 _PyUnicodeWriter *writer)
7361{
7362 const char *starts = s;
7363 const char *e;
7364 Py_ssize_t startinpos, endinpos;
7365 PyObject *errorHandler = NULL, *exc = NULL;
7366 Py_ssize_t maplen;
7367 enum PyUnicode_Kind mapkind;
7368 void *mapdata;
7369 Py_UCS4 x;
7370 unsigned char ch;
7371
7372 if (PyUnicode_READY(mapping) == -1)
7373 return -1;
7374
7375 maplen = PyUnicode_GET_LENGTH(mapping);
7376 mapdata = PyUnicode_DATA(mapping);
7377 mapkind = PyUnicode_KIND(mapping);
7378
7379 e = s + size;
7380
7381 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7382 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7383 * is disabled in encoding aliases, latin1 is preferred because
7384 * its implementation is faster. */
7385 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7386 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7387 Py_UCS4 maxchar = writer->maxchar;
7388
7389 assert (writer->kind == PyUnicode_1BYTE_KIND);
7390 while (s < e) {
7391 ch = *s;
7392 x = mapdata_ucs1[ch];
7393 if (x > maxchar) {
7394 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7395 goto onError;
7396 maxchar = writer->maxchar;
7397 outdata = (Py_UCS1 *)writer->data;
7398 }
7399 outdata[writer->pos] = x;
7400 writer->pos++;
7401 ++s;
7402 }
7403 return 0;
7404 }
7405
7406 while (s < e) {
7407 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7408 enum PyUnicode_Kind outkind = writer->kind;
7409 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7410 if (outkind == PyUnicode_1BYTE_KIND) {
7411 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7412 Py_UCS4 maxchar = writer->maxchar;
7413 while (s < e) {
7414 ch = *s;
7415 x = mapdata_ucs2[ch];
7416 if (x > maxchar)
7417 goto Error;
7418 outdata[writer->pos] = x;
7419 writer->pos++;
7420 ++s;
7421 }
7422 break;
7423 }
7424 else if (outkind == PyUnicode_2BYTE_KIND) {
7425 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7426 while (s < e) {
7427 ch = *s;
7428 x = mapdata_ucs2[ch];
7429 if (x == 0xFFFE)
7430 goto Error;
7431 outdata[writer->pos] = x;
7432 writer->pos++;
7433 ++s;
7434 }
7435 break;
7436 }
7437 }
7438 ch = *s;
7439
7440 if (ch < maplen)
7441 x = PyUnicode_READ(mapkind, mapdata, ch);
7442 else
7443 x = 0xfffe; /* invalid value */
7444Error:
7445 if (x == 0xfffe)
7446 {
7447 /* undefined mapping */
7448 startinpos = s-starts;
7449 endinpos = startinpos+1;
7450 if (unicode_decode_call_errorhandler_writer(
7451 errors, &errorHandler,
7452 "charmap", "character maps to <undefined>",
7453 &starts, &e, &startinpos, &endinpos, &exc, &s,
7454 writer)) {
7455 goto onError;
7456 }
7457 continue;
7458 }
7459
7460 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7461 goto onError;
7462 ++s;
7463 }
7464 Py_XDECREF(errorHandler);
7465 Py_XDECREF(exc);
7466 return 0;
7467
7468onError:
7469 Py_XDECREF(errorHandler);
7470 Py_XDECREF(exc);
7471 return -1;
7472}
7473
7474static int
7475charmap_decode_mapping(const char *s,
7476 Py_ssize_t size,
7477 PyObject *mapping,
7478 const char *errors,
7479 _PyUnicodeWriter *writer)
7480{
7481 const char *starts = s;
7482 const char *e;
7483 Py_ssize_t startinpos, endinpos;
7484 PyObject *errorHandler = NULL, *exc = NULL;
7485 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007486 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007487
7488 e = s + size;
7489
7490 while (s < e) {
7491 ch = *s;
7492
7493 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7494 key = PyLong_FromLong((long)ch);
7495 if (key == NULL)
7496 goto onError;
7497
7498 item = PyObject_GetItem(mapping, key);
7499 Py_DECREF(key);
7500 if (item == NULL) {
7501 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7502 /* No mapping found means: mapping is undefined. */
7503 PyErr_Clear();
7504 goto Undefined;
7505 } else
7506 goto onError;
7507 }
7508
7509 /* Apply mapping */
7510 if (item == Py_None)
7511 goto Undefined;
7512 if (PyLong_Check(item)) {
7513 long value = PyLong_AS_LONG(item);
7514 if (value == 0xFFFE)
7515 goto Undefined;
7516 if (value < 0 || value > MAX_UNICODE) {
7517 PyErr_Format(PyExc_TypeError,
7518 "character mapping must be in range(0x%lx)",
7519 (unsigned long)MAX_UNICODE + 1);
7520 goto onError;
7521 }
7522
7523 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7524 goto onError;
7525 }
7526 else if (PyUnicode_Check(item)) {
7527 if (PyUnicode_READY(item) == -1)
7528 goto onError;
7529 if (PyUnicode_GET_LENGTH(item) == 1) {
7530 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7531 if (value == 0xFFFE)
7532 goto Undefined;
7533 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7534 goto onError;
7535 }
7536 else {
7537 writer->overallocate = 1;
7538 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7539 goto onError;
7540 }
7541 }
7542 else {
7543 /* wrong return value */
7544 PyErr_SetString(PyExc_TypeError,
7545 "character mapping must return integer, None or str");
7546 goto onError;
7547 }
7548 Py_CLEAR(item);
7549 ++s;
7550 continue;
7551
7552Undefined:
7553 /* undefined mapping */
7554 Py_CLEAR(item);
7555 startinpos = s-starts;
7556 endinpos = startinpos+1;
7557 if (unicode_decode_call_errorhandler_writer(
7558 errors, &errorHandler,
7559 "charmap", "character maps to <undefined>",
7560 &starts, &e, &startinpos, &endinpos, &exc, &s,
7561 writer)) {
7562 goto onError;
7563 }
7564 }
7565 Py_XDECREF(errorHandler);
7566 Py_XDECREF(exc);
7567 return 0;
7568
7569onError:
7570 Py_XDECREF(item);
7571 Py_XDECREF(errorHandler);
7572 Py_XDECREF(exc);
7573 return -1;
7574}
7575
Alexander Belopolsky40018472011-02-26 01:02:56 +00007576PyObject *
7577PyUnicode_DecodeCharmap(const char *s,
7578 Py_ssize_t size,
7579 PyObject *mapping,
7580 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007582 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007583
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 /* Default to Latin-1 */
7585 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007589 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007590 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007591 writer.min_length = size;
7592 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007594
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007595 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007596 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7597 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007598 }
7599 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007600 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7601 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007603 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007604
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007606 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 return NULL;
7608}
7609
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007610/* Charmap encoding: the lookup table */
7611
Alexander Belopolsky40018472011-02-26 01:02:56 +00007612struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 PyObject_HEAD
7614 unsigned char level1[32];
7615 int count2, count3;
7616 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007617};
7618
7619static PyObject*
7620encoding_map_size(PyObject *obj, PyObject* args)
7621{
7622 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007623 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007625}
7626
7627static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 PyDoc_STR("Return the size (in bytes) of this object") },
7630 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007631};
7632
7633static void
7634encoding_map_dealloc(PyObject* o)
7635{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007636 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007637}
7638
7639static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007640 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 "EncodingMap", /*tp_name*/
7642 sizeof(struct encoding_map), /*tp_basicsize*/
7643 0, /*tp_itemsize*/
7644 /* methods */
7645 encoding_map_dealloc, /*tp_dealloc*/
7646 0, /*tp_print*/
7647 0, /*tp_getattr*/
7648 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007649 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 0, /*tp_repr*/
7651 0, /*tp_as_number*/
7652 0, /*tp_as_sequence*/
7653 0, /*tp_as_mapping*/
7654 0, /*tp_hash*/
7655 0, /*tp_call*/
7656 0, /*tp_str*/
7657 0, /*tp_getattro*/
7658 0, /*tp_setattro*/
7659 0, /*tp_as_buffer*/
7660 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7661 0, /*tp_doc*/
7662 0, /*tp_traverse*/
7663 0, /*tp_clear*/
7664 0, /*tp_richcompare*/
7665 0, /*tp_weaklistoffset*/
7666 0, /*tp_iter*/
7667 0, /*tp_iternext*/
7668 encoding_map_methods, /*tp_methods*/
7669 0, /*tp_members*/
7670 0, /*tp_getset*/
7671 0, /*tp_base*/
7672 0, /*tp_dict*/
7673 0, /*tp_descr_get*/
7674 0, /*tp_descr_set*/
7675 0, /*tp_dictoffset*/
7676 0, /*tp_init*/
7677 0, /*tp_alloc*/
7678 0, /*tp_new*/
7679 0, /*tp_free*/
7680 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007681};
7682
7683PyObject*
7684PyUnicode_BuildEncodingMap(PyObject* string)
7685{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007686 PyObject *result;
7687 struct encoding_map *mresult;
7688 int i;
7689 int need_dict = 0;
7690 unsigned char level1[32];
7691 unsigned char level2[512];
7692 unsigned char *mlevel1, *mlevel2, *mlevel3;
7693 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007694 int kind;
7695 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007696 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007697 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007698
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007699 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007700 PyErr_BadArgument();
7701 return NULL;
7702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 kind = PyUnicode_KIND(string);
7704 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007705 length = PyUnicode_GET_LENGTH(string);
7706 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007707 memset(level1, 0xFF, sizeof level1);
7708 memset(level2, 0xFF, sizeof level2);
7709
7710 /* If there isn't a one-to-one mapping of NULL to \0,
7711 or if there are non-BMP characters, we need to use
7712 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007713 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007714 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007715 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007716 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007717 ch = PyUnicode_READ(kind, data, i);
7718 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007719 need_dict = 1;
7720 break;
7721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007722 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007723 /* unmapped character */
7724 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007725 l1 = ch >> 11;
7726 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007727 if (level1[l1] == 0xFF)
7728 level1[l1] = count2++;
7729 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007730 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007731 }
7732
7733 if (count2 >= 0xFF || count3 >= 0xFF)
7734 need_dict = 1;
7735
7736 if (need_dict) {
7737 PyObject *result = PyDict_New();
7738 PyObject *key, *value;
7739 if (!result)
7740 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007741 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007742 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007743 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007744 if (!key || !value)
7745 goto failed1;
7746 if (PyDict_SetItem(result, key, value) == -1)
7747 goto failed1;
7748 Py_DECREF(key);
7749 Py_DECREF(value);
7750 }
7751 return result;
7752 failed1:
7753 Py_XDECREF(key);
7754 Py_XDECREF(value);
7755 Py_DECREF(result);
7756 return NULL;
7757 }
7758
7759 /* Create a three-level trie */
7760 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7761 16*count2 + 128*count3 - 1);
7762 if (!result)
7763 return PyErr_NoMemory();
7764 PyObject_Init(result, &EncodingMapType);
7765 mresult = (struct encoding_map*)result;
7766 mresult->count2 = count2;
7767 mresult->count3 = count3;
7768 mlevel1 = mresult->level1;
7769 mlevel2 = mresult->level23;
7770 mlevel3 = mresult->level23 + 16*count2;
7771 memcpy(mlevel1, level1, 32);
7772 memset(mlevel2, 0xFF, 16*count2);
7773 memset(mlevel3, 0, 128*count3);
7774 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007775 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007776 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007777 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7778 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007779 /* unmapped character */
7780 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007781 o1 = ch>>11;
7782 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783 i2 = 16*mlevel1[o1] + o2;
7784 if (mlevel2[i2] == 0xFF)
7785 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007786 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007787 i3 = 128*mlevel2[i2] + o3;
7788 mlevel3[i3] = i;
7789 }
7790 return result;
7791}
7792
7793static int
Victor Stinner22168992011-11-20 17:09:18 +01007794encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007795{
7796 struct encoding_map *map = (struct encoding_map*)mapping;
7797 int l1 = c>>11;
7798 int l2 = (c>>7) & 0xF;
7799 int l3 = c & 0x7F;
7800 int i;
7801
Victor Stinner22168992011-11-20 17:09:18 +01007802 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007804 if (c == 0)
7805 return 0;
7806 /* level 1*/
7807 i = map->level1[l1];
7808 if (i == 0xFF) {
7809 return -1;
7810 }
7811 /* level 2*/
7812 i = map->level23[16*i+l2];
7813 if (i == 0xFF) {
7814 return -1;
7815 }
7816 /* level 3 */
7817 i = map->level23[16*map->count2 + 128*i + l3];
7818 if (i == 0) {
7819 return -1;
7820 }
7821 return i;
7822}
7823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007824/* Lookup the character ch in the mapping. If the character
7825 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007826 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007827static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007828charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829{
Christian Heimes217cfd12007-12-02 14:31:20 +00007830 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007831 PyObject *x;
7832
7833 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007835 x = PyObject_GetItem(mapping, w);
7836 Py_DECREF(w);
7837 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7839 /* No mapping found means: mapping is undefined. */
7840 PyErr_Clear();
7841 x = Py_None;
7842 Py_INCREF(x);
7843 return x;
7844 } else
7845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007847 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007849 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 long value = PyLong_AS_LONG(x);
7851 if (value < 0 || value > 255) {
7852 PyErr_SetString(PyExc_TypeError,
7853 "character mapping must be in range(256)");
7854 Py_DECREF(x);
7855 return NULL;
7856 }
7857 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007859 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 /* wrong return value */
7863 PyErr_Format(PyExc_TypeError,
7864 "character mapping must return integer, bytes or None, not %.400s",
7865 x->ob_type->tp_name);
7866 Py_DECREF(x);
7867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 }
7869}
7870
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007872charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007874 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7875 /* exponentially overallocate to minimize reallocations */
7876 if (requiredsize < 2*outsize)
7877 requiredsize = 2*outsize;
7878 if (_PyBytes_Resize(outobj, requiredsize))
7879 return -1;
7880 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881}
7882
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007886/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007887 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007888 space is available. Return a new reference to the object that
7889 was put in the output buffer, or Py_None, if the mapping was undefined
7890 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007891 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007892static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007893charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007894 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007895{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007896 PyObject *rep;
7897 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007898 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007899
Christian Heimes90aa7642007-12-19 02:45:37 +00007900 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007903 if (res == -1)
7904 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 if (outsize<requiredsize)
7906 if (charmapencode_resize(outobj, outpos, requiredsize))
7907 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007908 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 outstart[(*outpos)++] = (char)res;
7910 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007911 }
7912
7913 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007914 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007916 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 Py_DECREF(rep);
7918 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007919 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 if (PyLong_Check(rep)) {
7921 Py_ssize_t requiredsize = *outpos+1;
7922 if (outsize<requiredsize)
7923 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7924 Py_DECREF(rep);
7925 return enc_EXCEPTION;
7926 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007927 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 else {
7931 const char *repchars = PyBytes_AS_STRING(rep);
7932 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7933 Py_ssize_t requiredsize = *outpos+repsize;
7934 if (outsize<requiredsize)
7935 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7936 Py_DECREF(rep);
7937 return enc_EXCEPTION;
7938 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007939 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 memcpy(outstart + *outpos, repchars, repsize);
7941 *outpos += repsize;
7942 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007944 Py_DECREF(rep);
7945 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946}
7947
7948/* handle an error in PyUnicode_EncodeCharmap
7949 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007950static int
7951charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007952 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007954 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007955 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956{
7957 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007958 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007959 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007960 enum PyUnicode_Kind kind;
7961 void *data;
7962 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007963 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007964 Py_ssize_t collstartpos = *inpos;
7965 Py_ssize_t collendpos = *inpos+1;
7966 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007967 char *encoding = "charmap";
7968 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007969 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007970 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007971 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972
Benjamin Petersonbac79492012-01-14 13:34:47 -05007973 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007974 return -1;
7975 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 /* find all unencodable characters */
7977 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007978 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007979 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007980 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007981 val = encoding_map_lookup(ch, mapping);
7982 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 break;
7984 ++collendpos;
7985 continue;
7986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007987
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007988 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7989 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 if (rep==NULL)
7991 return -1;
7992 else if (rep!=Py_None) {
7993 Py_DECREF(rep);
7994 break;
7995 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007996 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007998 }
7999 /* cache callback name lookup
8000 * (if not done yet, i.e. it's the first error) */
8001 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 if ((errors==NULL) || (!strcmp(errors, "strict")))
8003 *known_errorHandler = 1;
8004 else if (!strcmp(errors, "replace"))
8005 *known_errorHandler = 2;
8006 else if (!strcmp(errors, "ignore"))
8007 *known_errorHandler = 3;
8008 else if (!strcmp(errors, "xmlcharrefreplace"))
8009 *known_errorHandler = 4;
8010 else
8011 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008012 }
8013 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008014 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008015 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 return -1;
8017 case 2: /* replace */
8018 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 x = charmapencode_output('?', mapping, res, respos);
8020 if (x==enc_EXCEPTION) {
8021 return -1;
8022 }
8023 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008024 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 return -1;
8026 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008027 }
8028 /* fall through */
8029 case 3: /* ignore */
8030 *inpos = collendpos;
8031 break;
8032 case 4: /* xmlcharrefreplace */
8033 /* generate replacement (temporarily (mis)uses p) */
8034 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 char buffer[2+29+1+1];
8036 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008037 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 for (cp = buffer; *cp; ++cp) {
8039 x = charmapencode_output(*cp, mapping, res, respos);
8040 if (x==enc_EXCEPTION)
8041 return -1;
8042 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008043 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 return -1;
8045 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008046 }
8047 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008048 *inpos = collendpos;
8049 break;
8050 default:
8051 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008052 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008054 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008056 if (PyBytes_Check(repunicode)) {
8057 /* Directly copy bytes result to output. */
8058 Py_ssize_t outsize = PyBytes_Size(*res);
8059 Py_ssize_t requiredsize;
8060 repsize = PyBytes_Size(repunicode);
8061 requiredsize = *respos + repsize;
8062 if (requiredsize > outsize)
8063 /* Make room for all additional bytes. */
8064 if (charmapencode_resize(res, respos, requiredsize)) {
8065 Py_DECREF(repunicode);
8066 return -1;
8067 }
8068 memcpy(PyBytes_AsString(*res) + *respos,
8069 PyBytes_AsString(repunicode), repsize);
8070 *respos += repsize;
8071 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008072 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008073 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008075 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008076 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008077 Py_DECREF(repunicode);
8078 return -1;
8079 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008080 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008081 data = PyUnicode_DATA(repunicode);
8082 kind = PyUnicode_KIND(repunicode);
8083 for (index = 0; index < repsize; index++) {
8084 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8085 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008087 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 return -1;
8089 }
8090 else if (x==enc_FAILED) {
8091 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008092 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 return -1;
8094 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008095 }
8096 *inpos = newpos;
8097 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 }
8099 return 0;
8100}
8101
Alexander Belopolsky40018472011-02-26 01:02:56 +00008102PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103_PyUnicode_EncodeCharmap(PyObject *unicode,
8104 PyObject *mapping,
8105 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008107 /* output object */
8108 PyObject *res = NULL;
8109 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008110 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008111 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008113 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008114 PyObject *errorHandler = NULL;
8115 PyObject *exc = NULL;
8116 /* the following variable is used for caching string comparisons
8117 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8118 * 3=ignore, 4=xmlcharrefreplace */
8119 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008120 void *data;
8121 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122
Benjamin Petersonbac79492012-01-14 13:34:47 -05008123 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008124 return NULL;
8125 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008126 data = PyUnicode_DATA(unicode);
8127 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008128
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 /* Default to Latin-1 */
8130 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008131 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 /* allocate enough for a simple encoding without
8134 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008135 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136 if (res == NULL)
8137 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008138 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008142 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008144 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 if (x==enc_EXCEPTION) /* error */
8146 goto onError;
8147 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008148 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 &exc,
8150 &known_errorHandler, &errorHandler, errors,
8151 &res, &respos)) {
8152 goto onError;
8153 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008154 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 else
8156 /* done with this character => adjust input position */
8157 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008160 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008161 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008162 if (_PyBytes_Resize(&res, respos) < 0)
8163 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008164
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165 Py_XDECREF(exc);
8166 Py_XDECREF(errorHandler);
8167 return res;
8168
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008170 Py_XDECREF(res);
8171 Py_XDECREF(exc);
8172 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 return NULL;
8174}
8175
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008176/* Deprecated */
8177PyObject *
8178PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8179 Py_ssize_t size,
8180 PyObject *mapping,
8181 const char *errors)
8182{
8183 PyObject *result;
8184 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8185 if (unicode == NULL)
8186 return NULL;
8187 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8188 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008189 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008190}
8191
Alexander Belopolsky40018472011-02-26 01:02:56 +00008192PyObject *
8193PyUnicode_AsCharmapString(PyObject *unicode,
8194 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195{
8196 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 PyErr_BadArgument();
8198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008200 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201}
8202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008204static void
8205make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008206 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008207 Py_ssize_t startpos, Py_ssize_t endpos,
8208 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211 *exceptionObject = _PyUnicodeTranslateError_Create(
8212 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 }
8214 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8216 goto onError;
8217 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8218 goto onError;
8219 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8220 goto onError;
8221 return;
8222 onError:
8223 Py_DECREF(*exceptionObject);
8224 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 }
8226}
8227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008228/* error handling callback helper:
8229 build arguments, call the callback and check the arguments,
8230 put the result into newpos and return the replacement string, which
8231 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008232static PyObject *
8233unicode_translate_call_errorhandler(const char *errors,
8234 PyObject **errorHandler,
8235 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008237 Py_ssize_t startpos, Py_ssize_t endpos,
8238 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008240 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008242 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 PyObject *restuple;
8244 PyObject *resunicode;
8245
8246 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008248 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250 }
8251
8252 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256
8257 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008262 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 Py_DECREF(restuple);
8264 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 }
8266 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 &resunicode, &i_newpos)) {
8268 Py_DECREF(restuple);
8269 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008271 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008273 else
8274 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008275 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8277 Py_DECREF(restuple);
8278 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008279 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 Py_INCREF(resunicode);
8281 Py_DECREF(restuple);
8282 return resunicode;
8283}
8284
8285/* Lookup the character ch in the mapping and put the result in result,
8286 which must be decrefed by the caller.
8287 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008288static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290{
Christian Heimes217cfd12007-12-02 14:31:20 +00008291 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 PyObject *x;
8293
8294 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296 x = PyObject_GetItem(mapping, w);
8297 Py_DECREF(w);
8298 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8300 /* No mapping found means: use 1:1 mapping. */
8301 PyErr_Clear();
8302 *result = NULL;
8303 return 0;
8304 } else
8305 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 }
8307 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 *result = x;
8309 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008311 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 long value = PyLong_AS_LONG(x);
8313 long max = PyUnicode_GetMax();
8314 if (value < 0 || value > max) {
8315 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008316 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 Py_DECREF(x);
8318 return -1;
8319 }
8320 *result = x;
8321 return 0;
8322 }
8323 else if (PyUnicode_Check(x)) {
8324 *result = x;
8325 return 0;
8326 }
8327 else {
8328 /* wrong return value */
8329 PyErr_SetString(PyExc_TypeError,
8330 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008331 Py_DECREF(x);
8332 return -1;
8333 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334}
8335/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 if not reallocate and adjust various state variables.
8337 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008338static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008343 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008344 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 /* exponentially overallocate to minimize reallocations */
8346 if (requiredsize < 2 * oldsize)
8347 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008348 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8349 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008351 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353 }
8354 return 0;
8355}
8356/* lookup the character, put the result in the output string and adjust
8357 various state variables. Return a new reference to the object that
8358 was put in the output buffer in *result, or Py_None, if the mapping was
8359 undefined (in which case no character was written).
8360 The called must decref result.
8361 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008363charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8364 PyObject *mapping, Py_UCS4 **output,
8365 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8369 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 }
8375 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008377 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 }
8381 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 Py_ssize_t repsize;
8383 if (PyUnicode_READY(*res) == -1)
8384 return -1;
8385 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 if (repsize==1) {
8387 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 }
8390 else if (repsize!=0) {
8391 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 Py_ssize_t requiredsize = *opos +
8393 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 Py_ssize_t i;
8396 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 for(i = 0; i < repsize; i++)
8399 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 }
8402 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 return 0;
8405}
8406
Alexander Belopolsky40018472011-02-26 01:02:56 +00008407PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408_PyUnicode_TranslateCharmap(PyObject *input,
8409 PyObject *mapping,
8410 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008412 /* input object */
8413 char *idata;
8414 Py_ssize_t size, i;
8415 int kind;
8416 /* output buffer */
8417 Py_UCS4 *output = NULL;
8418 Py_ssize_t osize;
8419 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 char *reason = "character maps to <undefined>";
8423 PyObject *errorHandler = NULL;
8424 PyObject *exc = NULL;
8425 /* the following variable is used for caching string comparisons
8426 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8427 * 3=ignore, 4=xmlcharrefreplace */
8428 int known_errorHandler = -1;
8429
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 PyErr_BadArgument();
8432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 if (PyUnicode_READY(input) == -1)
8436 return NULL;
8437 idata = (char*)PyUnicode_DATA(input);
8438 kind = PyUnicode_KIND(input);
8439 size = PyUnicode_GET_LENGTH(input);
8440 i = 0;
8441
8442 if (size == 0) {
8443 Py_INCREF(input);
8444 return input;
8445 }
8446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 /* allocate enough for a simple 1:1 translation without
8448 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 osize = size;
8450 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8451 opos = 0;
8452 if (output == NULL) {
8453 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 /* try to encode it */
8459 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 if (charmaptranslate_output(input, i, mapping,
8461 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 Py_XDECREF(x);
8463 goto onError;
8464 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008465 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 else { /* untranslatable character */
8469 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8470 Py_ssize_t repsize;
8471 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 Py_ssize_t collstart = i;
8475 Py_ssize_t collend = i+1;
8476 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 while (collend < size) {
8480 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 goto onError;
8482 Py_XDECREF(x);
8483 if (x!=Py_None)
8484 break;
8485 ++collend;
8486 }
8487 /* cache callback name lookup
8488 * (if not done yet, i.e. it's the first error) */
8489 if (known_errorHandler==-1) {
8490 if ((errors==NULL) || (!strcmp(errors, "strict")))
8491 known_errorHandler = 1;
8492 else if (!strcmp(errors, "replace"))
8493 known_errorHandler = 2;
8494 else if (!strcmp(errors, "ignore"))
8495 known_errorHandler = 3;
8496 else if (!strcmp(errors, "xmlcharrefreplace"))
8497 known_errorHandler = 4;
8498 else
8499 known_errorHandler = 0;
8500 }
8501 switch (known_errorHandler) {
8502 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008503 make_translate_exception(&exc,
8504 input, collstart, collend, reason);
8505 if (exc != NULL)
8506 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008507 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 case 2: /* replace */
8509 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 for (coll = collstart; coll<collend; coll++)
8511 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 /* fall through */
8513 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 break;
8516 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 /* generate replacement (temporarily (mis)uses i) */
8518 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 char buffer[2+29+1+1];
8520 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8522 if (charmaptranslate_makespace(&output, &osize,
8523 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 goto onError;
8525 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 break;
8530 default:
8531 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 reason, input, &exc,
8533 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008534 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008536 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008537 Py_DECREF(repunicode);
8538 goto onError;
8539 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 repsize = PyUnicode_GET_LENGTH(repunicode);
8542 if (charmaptranslate_makespace(&output, &osize,
8543 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 Py_DECREF(repunicode);
8545 goto onError;
8546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 for (uni2 = 0; repsize-->0; ++uni2)
8548 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8549 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008551 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008552 }
8553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8555 if (!res)
8556 goto onError;
8557 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 Py_XDECREF(exc);
8559 Py_XDECREF(errorHandler);
8560 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 Py_XDECREF(exc);
8565 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 return NULL;
8567}
8568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569/* Deprecated. Use PyUnicode_Translate instead. */
8570PyObject *
8571PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8572 Py_ssize_t size,
8573 PyObject *mapping,
8574 const char *errors)
8575{
Christian Heimes5f520f42012-09-11 14:03:25 +02008576 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8578 if (!unicode)
8579 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008580 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8581 Py_DECREF(unicode);
8582 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583}
8584
Alexander Belopolsky40018472011-02-26 01:02:56 +00008585PyObject *
8586PyUnicode_Translate(PyObject *str,
8587 PyObject *mapping,
8588 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589{
8590 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008591
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 str = PyUnicode_FromObject(str);
8593 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008594 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 Py_DECREF(str);
8597 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598}
Tim Petersced69f82003-09-16 20:30:58 +00008599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008601fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602{
8603 /* No need to call PyUnicode_READY(self) because this function is only
8604 called as a callback from fixup() which does it already. */
8605 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8606 const int kind = PyUnicode_KIND(self);
8607 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008608 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008609 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 Py_ssize_t i;
8611
8612 for (i = 0; i < len; ++i) {
8613 ch = PyUnicode_READ(kind, data, i);
8614 fixed = 0;
8615 if (ch > 127) {
8616 if (Py_UNICODE_ISSPACE(ch))
8617 fixed = ' ';
8618 else {
8619 const int decimal = Py_UNICODE_TODECIMAL(ch);
8620 if (decimal >= 0)
8621 fixed = '0' + decimal;
8622 }
8623 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008624 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008625 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 PyUnicode_WRITE(kind, data, i, fixed);
8627 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008628 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008629 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 }
8632
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008633 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634}
8635
8636PyObject *
8637_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8638{
8639 if (!PyUnicode_Check(unicode)) {
8640 PyErr_BadInternalCall();
8641 return NULL;
8642 }
8643 if (PyUnicode_READY(unicode) == -1)
8644 return NULL;
8645 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8646 /* If the string is already ASCII, just return the same string */
8647 Py_INCREF(unicode);
8648 return unicode;
8649 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008650 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651}
8652
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008653PyObject *
8654PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8655 Py_ssize_t length)
8656{
Victor Stinnerf0124502011-11-21 23:12:56 +01008657 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008658 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008659 Py_UCS4 maxchar;
8660 enum PyUnicode_Kind kind;
8661 void *data;
8662
Victor Stinner99d7ad02012-02-22 13:37:39 +01008663 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008664 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008665 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008666 if (ch > 127) {
8667 int decimal = Py_UNICODE_TODECIMAL(ch);
8668 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008669 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008670 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008671 }
8672 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008673
8674 /* Copy to a new string */
8675 decimal = PyUnicode_New(length, maxchar);
8676 if (decimal == NULL)
8677 return decimal;
8678 kind = PyUnicode_KIND(decimal);
8679 data = PyUnicode_DATA(decimal);
8680 /* Iterate over code points */
8681 for (i = 0; i < length; i++) {
8682 Py_UNICODE ch = s[i];
8683 if (ch > 127) {
8684 int decimal = Py_UNICODE_TODECIMAL(ch);
8685 if (decimal >= 0)
8686 ch = '0' + decimal;
8687 }
8688 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008690 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008691}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008692/* --- Decimal Encoder ---------------------------------------------------- */
8693
Alexander Belopolsky40018472011-02-26 01:02:56 +00008694int
8695PyUnicode_EncodeDecimal(Py_UNICODE *s,
8696 Py_ssize_t length,
8697 char *output,
8698 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008699{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008700 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008701 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008702 enum PyUnicode_Kind kind;
8703 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008704
8705 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 PyErr_BadArgument();
8707 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008708 }
8709
Victor Stinner42bf7752011-11-21 22:52:58 +01008710 unicode = PyUnicode_FromUnicode(s, length);
8711 if (unicode == NULL)
8712 return -1;
8713
Benjamin Petersonbac79492012-01-14 13:34:47 -05008714 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008715 Py_DECREF(unicode);
8716 return -1;
8717 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008718 kind = PyUnicode_KIND(unicode);
8719 data = PyUnicode_DATA(unicode);
8720
Victor Stinnerb84d7232011-11-22 01:50:07 +01008721 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008722 PyObject *exc;
8723 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008725 Py_ssize_t startpos;
8726
8727 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008728
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008730 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008731 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008733 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 decimal = Py_UNICODE_TODECIMAL(ch);
8735 if (decimal >= 0) {
8736 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008737 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 continue;
8739 }
8740 if (0 < ch && ch < 256) {
8741 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008742 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 continue;
8744 }
Victor Stinner6345be92011-11-25 20:09:01 +01008745
Victor Stinner42bf7752011-11-21 22:52:58 +01008746 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008747 exc = NULL;
8748 raise_encode_exception(&exc, "decimal", unicode,
8749 startpos, startpos+1,
8750 "invalid decimal Unicode string");
8751 Py_XDECREF(exc);
8752 Py_DECREF(unicode);
8753 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008754 }
8755 /* 0-terminate the output string */
8756 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008757 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008758 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008759}
8760
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761/* --- Helpers ------------------------------------------------------------ */
8762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008764any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 Py_ssize_t start,
8766 Py_ssize_t end)
8767{
8768 int kind1, kind2, kind;
8769 void *buf1, *buf2;
8770 Py_ssize_t len1, len2, result;
8771
8772 kind1 = PyUnicode_KIND(s1);
8773 kind2 = PyUnicode_KIND(s2);
8774 kind = kind1 > kind2 ? kind1 : kind2;
8775 buf1 = PyUnicode_DATA(s1);
8776 buf2 = PyUnicode_DATA(s2);
8777 if (kind1 != kind)
8778 buf1 = _PyUnicode_AsKind(s1, kind);
8779 if (!buf1)
8780 return -2;
8781 if (kind2 != kind)
8782 buf2 = _PyUnicode_AsKind(s2, kind);
8783 if (!buf2) {
8784 if (kind1 != kind) PyMem_Free(buf1);
8785 return -2;
8786 }
8787 len1 = PyUnicode_GET_LENGTH(s1);
8788 len2 = PyUnicode_GET_LENGTH(s2);
8789
Victor Stinner794d5672011-10-10 03:21:36 +02008790 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008791 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008792 case PyUnicode_1BYTE_KIND:
8793 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8794 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8795 else
8796 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8797 break;
8798 case PyUnicode_2BYTE_KIND:
8799 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8800 break;
8801 case PyUnicode_4BYTE_KIND:
8802 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8803 break;
8804 default:
8805 assert(0); result = -2;
8806 }
8807 }
8808 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008809 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008810 case PyUnicode_1BYTE_KIND:
8811 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8812 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8813 else
8814 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8815 break;
8816 case PyUnicode_2BYTE_KIND:
8817 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8818 break;
8819 case PyUnicode_4BYTE_KIND:
8820 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8821 break;
8822 default:
8823 assert(0); result = -2;
8824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 }
8826
8827 if (kind1 != kind)
8828 PyMem_Free(buf1);
8829 if (kind2 != kind)
8830 PyMem_Free(buf2);
8831
8832 return result;
8833}
8834
8835Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008836_PyUnicode_InsertThousandsGrouping(
8837 PyObject *unicode, Py_ssize_t index,
8838 Py_ssize_t n_buffer,
8839 void *digits, Py_ssize_t n_digits,
8840 Py_ssize_t min_width,
8841 const char *grouping, PyObject *thousands_sep,
8842 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843{
Victor Stinner41a863c2012-02-24 00:37:51 +01008844 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008845 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008846 Py_ssize_t thousands_sep_len;
8847 Py_ssize_t len;
8848
8849 if (unicode != NULL) {
8850 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008851 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008852 }
8853 else {
8854 kind = PyUnicode_1BYTE_KIND;
8855 data = NULL;
8856 }
8857 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8858 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8859 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8860 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008861 if (thousands_sep_kind < kind) {
8862 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8863 if (!thousands_sep_data)
8864 return -1;
8865 }
8866 else {
8867 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8868 if (!data)
8869 return -1;
8870 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008871 }
8872
Benjamin Petersonead6b532011-12-20 17:23:42 -06008873 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008875 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008876 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008877 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008878 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008879 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008880 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008881 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008882 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008883 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008884 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008885 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008887 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008888 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008889 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008890 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008891 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008893 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008894 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008895 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008896 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008897 break;
8898 default:
8899 assert(0);
8900 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008902 if (unicode != NULL && thousands_sep_kind != kind) {
8903 if (thousands_sep_kind < kind)
8904 PyMem_Free(thousands_sep_data);
8905 else
8906 PyMem_Free(data);
8907 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008908 if (unicode == NULL) {
8909 *maxchar = 127;
8910 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008911 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008912 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008913 }
8914 }
8915 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916}
8917
8918
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len` from above; a negative `end` or `start` is
   taken relative to `len` and then clamped to 0.  `start` is NOT clamped
   from above, so callers must cope with start > end afterwards.

   `start` and `end` must be modifiable lvalues; all three arguments may
   be evaluated more than once.  The body is wrapped in do { } while (0)
   so that `ADJUST_INDICES(...);` behaves as a single statement, e.g.
   under an unbraced if/else.  The #undef guards against an earlier,
   textually different definition (some included headers may provide
   their own copy). */
#undef ADJUST_INDICES
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008933
Alexander Belopolsky40018472011-02-26 01:02:56 +00008934Py_ssize_t
8935PyUnicode_Count(PyObject *str,
8936 PyObject *substr,
8937 Py_ssize_t start,
8938 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008940 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008941 PyObject* str_obj;
8942 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 int kind1, kind2, kind;
8944 void *buf1 = NULL, *buf2 = NULL;
8945 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008946
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008947 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008948 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008950 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008951 if (!sub_obj) {
8952 Py_DECREF(str_obj);
8953 return -1;
8954 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008955 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008956 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 Py_DECREF(str_obj);
8958 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 }
Tim Petersced69f82003-09-16 20:30:58 +00008960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 kind1 = PyUnicode_KIND(str_obj);
8962 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008963 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008966 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008967 if (kind2 > kind) {
8968 Py_DECREF(sub_obj);
8969 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008970 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008971 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008972 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 if (!buf2)
8975 goto onError;
8976 len1 = PyUnicode_GET_LENGTH(str_obj);
8977 len2 = PyUnicode_GET_LENGTH(sub_obj);
8978
8979 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008980 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008982 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8983 result = asciilib_count(
8984 ((Py_UCS1*)buf1) + start, end - start,
8985 buf2, len2, PY_SSIZE_T_MAX
8986 );
8987 else
8988 result = ucs1lib_count(
8989 ((Py_UCS1*)buf1) + start, end - start,
8990 buf2, len2, PY_SSIZE_T_MAX
8991 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 break;
8993 case PyUnicode_2BYTE_KIND:
8994 result = ucs2lib_count(
8995 ((Py_UCS2*)buf1) + start, end - start,
8996 buf2, len2, PY_SSIZE_T_MAX
8997 );
8998 break;
8999 case PyUnicode_4BYTE_KIND:
9000 result = ucs4lib_count(
9001 ((Py_UCS4*)buf1) + start, end - start,
9002 buf2, len2, PY_SSIZE_T_MAX
9003 );
9004 break;
9005 default:
9006 assert(0); result = 0;
9007 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009008
9009 Py_DECREF(sub_obj);
9010 Py_DECREF(str_obj);
9011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 if (kind2 != kind)
9013 PyMem_Free(buf2);
9014
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 onError:
9017 Py_DECREF(sub_obj);
9018 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 if (kind2 != kind && buf2)
9020 PyMem_Free(buf2);
9021 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022}
9023
Alexander Belopolsky40018472011-02-26 01:02:56 +00009024Py_ssize_t
9025PyUnicode_Find(PyObject *str,
9026 PyObject *sub,
9027 Py_ssize_t start,
9028 Py_ssize_t end,
9029 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009031 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009032
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009034 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009036 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009037 if (!sub) {
9038 Py_DECREF(str);
9039 return -2;
9040 }
9041 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9042 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 Py_DECREF(str);
9044 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045 }
Tim Petersced69f82003-09-16 20:30:58 +00009046
Victor Stinner794d5672011-10-10 03:21:36 +02009047 result = any_find_slice(direction,
9048 str, sub, start, end
9049 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009050
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009052 Py_DECREF(sub);
9053
Guido van Rossumd57fd912000-03-10 22:53:23 +00009054 return result;
9055}
9056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057Py_ssize_t
9058PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9059 Py_ssize_t start, Py_ssize_t end,
9060 int direction)
9061{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009063 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 if (PyUnicode_READY(str) == -1)
9065 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009066 if (start < 0 || end < 0) {
9067 PyErr_SetString(PyExc_IndexError, "string index out of range");
9068 return -2;
9069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 if (end > PyUnicode_GET_LENGTH(str))
9071 end = PyUnicode_GET_LENGTH(str);
9072 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009073 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9074 kind, end-start, ch, direction);
9075 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009077 else
9078 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079}
9080
Alexander Belopolsky40018472011-02-26 01:02:56 +00009081static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009082tailmatch(PyObject *self,
9083 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009084 Py_ssize_t start,
9085 Py_ssize_t end,
9086 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 int kind_self;
9089 int kind_sub;
9090 void *data_self;
9091 void *data_sub;
9092 Py_ssize_t offset;
9093 Py_ssize_t i;
9094 Py_ssize_t end_sub;
9095
9096 if (PyUnicode_READY(self) == -1 ||
9097 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009098 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099
9100 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 return 1;
9102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9104 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 kind_self = PyUnicode_KIND(self);
9109 data_self = PyUnicode_DATA(self);
9110 kind_sub = PyUnicode_KIND(substring);
9111 data_sub = PyUnicode_DATA(substring);
9112 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9113
9114 if (direction > 0)
9115 offset = end;
9116 else
9117 offset = start;
9118
9119 if (PyUnicode_READ(kind_self, data_self, offset) ==
9120 PyUnicode_READ(kind_sub, data_sub, 0) &&
9121 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9122 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9123 /* If both are of the same kind, memcmp is sufficient */
9124 if (kind_self == kind_sub) {
9125 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009126 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 data_sub,
9128 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009129 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 }
9131 /* otherwise we have to compare each character by first accesing it */
9132 else {
9133 /* We do not need to compare 0 and len(substring)-1 because
9134 the if statement above ensured already that they are equal
9135 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 for (i = 1; i < end_sub; ++i) {
9137 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9138 PyUnicode_READ(kind_sub, data_sub, i))
9139 return 0;
9140 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143 }
9144
9145 return 0;
9146}
9147
Alexander Belopolsky40018472011-02-26 01:02:56 +00009148Py_ssize_t
9149PyUnicode_Tailmatch(PyObject *str,
9150 PyObject *substr,
9151 Py_ssize_t start,
9152 Py_ssize_t end,
9153 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009155 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009156
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 str = PyUnicode_FromObject(str);
9158 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160 substr = PyUnicode_FromObject(substr);
9161 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009162 Py_DECREF(str);
9163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164 }
Tim Petersced69f82003-09-16 20:30:58 +00009165
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009166 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168 Py_DECREF(str);
9169 Py_DECREF(substr);
9170 return result;
9171}
9172
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173/* Apply fixfct filter to the Unicode object self and return a
9174 reference to the modified object */
9175
Alexander Belopolsky40018472011-02-26 01:02:56 +00009176static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009177fixup(PyObject *self,
9178 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 PyObject *u;
9181 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009182 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009184 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009187 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 /* fix functions return the new maximum character in a string,
9190 if the kind of the resulting unicode object does not change,
9191 everything is fine. Otherwise we need to change the string kind
9192 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009193 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009194
9195 if (maxchar_new == 0) {
9196 /* no changes */;
9197 if (PyUnicode_CheckExact(self)) {
9198 Py_DECREF(u);
9199 Py_INCREF(self);
9200 return self;
9201 }
9202 else
9203 return u;
9204 }
9205
Victor Stinnere6abb482012-05-02 01:15:40 +02009206 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207
Victor Stinnereaab6042011-12-11 22:22:39 +01009208 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009210
9211 /* In case the maximum character changed, we need to
9212 convert the string to the new category. */
9213 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9214 if (v == NULL) {
9215 Py_DECREF(u);
9216 return NULL;
9217 }
9218 if (maxchar_new > maxchar_old) {
9219 /* If the maxchar increased so that the kind changed, not all
9220 characters are representable anymore and we need to fix the
9221 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009222 _PyUnicode_FastCopyCharacters(v, 0,
9223 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009224 maxchar_old = fixfct(v);
9225 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 }
9227 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009228 _PyUnicode_FastCopyCharacters(v, 0,
9229 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009231 Py_DECREF(u);
9232 assert(_PyUnicode_CheckConsistency(v, 1));
9233 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234}
9235
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009236static PyObject *
9237ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009239 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9240 char *resdata, *data = PyUnicode_DATA(self);
9241 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009242
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009243 res = PyUnicode_New(len, 127);
9244 if (res == NULL)
9245 return NULL;
9246 resdata = PyUnicode_DATA(res);
9247 if (lower)
9248 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009250 _Py_bytes_upper(resdata, data, len);
9251 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252}
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009255handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009257 Py_ssize_t j;
9258 int final_sigma;
9259 Py_UCS4 c;
9260 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009261
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009262 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9263
9264 where ! is a negation and \p{xxx} is a character with property xxx.
9265 */
9266 for (j = i - 1; j >= 0; j--) {
9267 c = PyUnicode_READ(kind, data, j);
9268 if (!_PyUnicode_IsCaseIgnorable(c))
9269 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009271 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9272 if (final_sigma) {
9273 for (j = i + 1; j < length; j++) {
9274 c = PyUnicode_READ(kind, data, j);
9275 if (!_PyUnicode_IsCaseIgnorable(c))
9276 break;
9277 }
9278 final_sigma = j == length || !_PyUnicode_IsCased(c);
9279 }
9280 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281}
9282
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009283static int
9284lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9285 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009287 /* Obscure special case. */
9288 if (c == 0x3A3) {
9289 mapped[0] = handle_capital_sigma(kind, data, length, i);
9290 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009292 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293}
9294
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009295static Py_ssize_t
9296do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009298 Py_ssize_t i, k = 0;
9299 int n_res, j;
9300 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009301
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009302 c = PyUnicode_READ(kind, data, 0);
9303 n_res = _PyUnicode_ToUpperFull(c, mapped);
9304 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009305 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009306 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009308 for (i = 1; i < length; i++) {
9309 c = PyUnicode_READ(kind, data, i);
9310 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9311 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009312 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009313 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009314 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009315 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009316 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317}
9318
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009319static Py_ssize_t
9320do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9321 Py_ssize_t i, k = 0;
9322
9323 for (i = 0; i < length; i++) {
9324 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9325 int n_res, j;
9326 if (Py_UNICODE_ISUPPER(c)) {
9327 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9328 }
9329 else if (Py_UNICODE_ISLOWER(c)) {
9330 n_res = _PyUnicode_ToUpperFull(c, mapped);
9331 }
9332 else {
9333 n_res = 1;
9334 mapped[0] = c;
9335 }
9336 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009337 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009338 res[k++] = mapped[j];
9339 }
9340 }
9341 return k;
9342}
9343
9344static Py_ssize_t
9345do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9346 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009348 Py_ssize_t i, k = 0;
9349
9350 for (i = 0; i < length; i++) {
9351 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9352 int n_res, j;
9353 if (lower)
9354 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9355 else
9356 n_res = _PyUnicode_ToUpperFull(c, mapped);
9357 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009358 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009359 res[k++] = mapped[j];
9360 }
9361 }
9362 return k;
9363}
9364
9365static Py_ssize_t
9366do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9367{
9368 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9369}
9370
9371static Py_ssize_t
9372do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9373{
9374 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9375}
9376
Benjamin Petersone51757f2012-01-12 21:10:29 -05009377static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009378do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9379{
9380 Py_ssize_t i, k = 0;
9381
9382 for (i = 0; i < length; i++) {
9383 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9384 Py_UCS4 mapped[3];
9385 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9386 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009387 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009388 res[k++] = mapped[j];
9389 }
9390 }
9391 return k;
9392}
9393
9394static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009395do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9396{
9397 Py_ssize_t i, k = 0;
9398 int previous_is_cased;
9399
9400 previous_is_cased = 0;
9401 for (i = 0; i < length; i++) {
9402 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9403 Py_UCS4 mapped[3];
9404 int n_res, j;
9405
9406 if (previous_is_cased)
9407 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9408 else
9409 n_res = _PyUnicode_ToTitleFull(c, mapped);
9410
9411 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009412 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009413 res[k++] = mapped[j];
9414 }
9415
9416 previous_is_cased = _PyUnicode_IsCased(c);
9417 }
9418 return k;
9419}
9420
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009421static PyObject *
9422case_operation(PyObject *self,
9423 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9424{
9425 PyObject *res = NULL;
9426 Py_ssize_t length, newlength = 0;
9427 int kind, outkind;
9428 void *data, *outdata;
9429 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9430
Benjamin Petersoneea48462012-01-16 14:28:50 -05009431 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009432
9433 kind = PyUnicode_KIND(self);
9434 data = PyUnicode_DATA(self);
9435 length = PyUnicode_GET_LENGTH(self);
9436 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9437 if (tmp == NULL)
9438 return PyErr_NoMemory();
9439 newlength = perform(kind, data, length, tmp, &maxchar);
9440 res = PyUnicode_New(newlength, maxchar);
9441 if (res == NULL)
9442 goto leave;
9443 tmpend = tmp + newlength;
9444 outdata = PyUnicode_DATA(res);
9445 outkind = PyUnicode_KIND(res);
9446 switch (outkind) {
9447 case PyUnicode_1BYTE_KIND:
9448 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9449 break;
9450 case PyUnicode_2BYTE_KIND:
9451 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9452 break;
9453 case PyUnicode_4BYTE_KIND:
9454 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9455 break;
9456 default:
9457 assert(0);
9458 break;
9459 }
9460 leave:
9461 PyMem_FREE(tmp);
9462 return res;
9463}
9464
Tim Peters8ce9f162004-08-27 01:49:32 +00009465PyObject *
9466PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009469 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009471 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009472 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9473 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009474 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009476 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009478 int use_memcpy;
9479 unsigned char *res_data = NULL, *sep_data = NULL;
9480 PyObject *last_obj;
9481 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482
Tim Peters05eba1f2004-08-27 21:32:02 +00009483 fseq = PySequence_Fast(seq, "");
9484 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009485 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009486 }
9487
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009488 /* NOTE: the following code can't call back into Python code,
9489 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009490 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009491
Tim Peters05eba1f2004-08-27 21:32:02 +00009492 seqlen = PySequence_Fast_GET_SIZE(fseq);
9493 /* If empty sequence, return u"". */
9494 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009495 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009496 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009497 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009498
Tim Peters05eba1f2004-08-27 21:32:02 +00009499 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009500 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009501 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009502 if (seqlen == 1) {
9503 if (PyUnicode_CheckExact(items[0])) {
9504 res = items[0];
9505 Py_INCREF(res);
9506 Py_DECREF(fseq);
9507 return res;
9508 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009509 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009510 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009511 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009512 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009513 /* Set up sep and seplen */
9514 if (separator == NULL) {
9515 /* fall back to a blank space separator */
9516 sep = PyUnicode_FromOrdinal(' ');
9517 if (!sep)
9518 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009519 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009520 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009521 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009522 else {
9523 if (!PyUnicode_Check(separator)) {
9524 PyErr_Format(PyExc_TypeError,
9525 "separator: expected str instance,"
9526 " %.80s found",
9527 Py_TYPE(separator)->tp_name);
9528 goto onError;
9529 }
9530 if (PyUnicode_READY(separator))
9531 goto onError;
9532 sep = separator;
9533 seplen = PyUnicode_GET_LENGTH(separator);
9534 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9535 /* inc refcount to keep this code path symmetric with the
9536 above case of a blank separator */
9537 Py_INCREF(sep);
9538 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009539 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009540 }
9541
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009542 /* There are at least two things to join, or else we have a subclass
9543 * of str in the sequence.
9544 * Do a pre-pass to figure out the total amount of space we'll
9545 * need (sz), and see whether all argument are strings.
9546 */
9547 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009548#ifdef Py_DEBUG
9549 use_memcpy = 0;
9550#else
9551 use_memcpy = 1;
9552#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009553 for (i = 0; i < seqlen; i++) {
9554 const Py_ssize_t old_sz = sz;
9555 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 if (!PyUnicode_Check(item)) {
9557 PyErr_Format(PyExc_TypeError,
9558 "sequence item %zd: expected str instance,"
9559 " %.80s found",
9560 i, Py_TYPE(item)->tp_name);
9561 goto onError;
9562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 if (PyUnicode_READY(item) == -1)
9564 goto onError;
9565 sz += PyUnicode_GET_LENGTH(item);
9566 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009567 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009568 if (i != 0)
9569 sz += seplen;
9570 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9571 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009573 goto onError;
9574 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009575 if (use_memcpy && last_obj != NULL) {
9576 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9577 use_memcpy = 0;
9578 }
9579 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009580 }
Tim Petersced69f82003-09-16 20:30:58 +00009581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009583 if (res == NULL)
9584 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009585
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009586 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009587#ifdef Py_DEBUG
9588 use_memcpy = 0;
9589#else
9590 if (use_memcpy) {
9591 res_data = PyUnicode_1BYTE_DATA(res);
9592 kind = PyUnicode_KIND(res);
9593 if (seplen != 0)
9594 sep_data = PyUnicode_1BYTE_DATA(sep);
9595 }
9596#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009597 if (use_memcpy) {
9598 for (i = 0; i < seqlen; ++i) {
9599 Py_ssize_t itemlen;
9600 item = items[i];
9601
9602 /* Copy item, and maybe the separator. */
9603 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009604 Py_MEMCPY(res_data,
9605 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009606 kind * seplen);
9607 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009608 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009609
9610 itemlen = PyUnicode_GET_LENGTH(item);
9611 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009612 Py_MEMCPY(res_data,
9613 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009614 kind * itemlen);
9615 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009616 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009617 }
9618 assert(res_data == PyUnicode_1BYTE_DATA(res)
9619 + kind * PyUnicode_GET_LENGTH(res));
9620 }
9621 else {
9622 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9623 Py_ssize_t itemlen;
9624 item = items[i];
9625
9626 /* Copy item, and maybe the separator. */
9627 if (i && seplen != 0) {
9628 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9629 res_offset += seplen;
9630 }
9631
9632 itemlen = PyUnicode_GET_LENGTH(item);
9633 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009634 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009635 res_offset += itemlen;
9636 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009637 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009638 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009639 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009640
Tim Peters05eba1f2004-08-27 21:32:02 +00009641 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009643 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645
Benjamin Peterson29060642009-01-31 22:14:21 +00009646 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009647 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009649 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650 return NULL;
9651}
9652
/* Store value into length consecutive characters of the buffer data,
   starting at index start.  kind selects the character width of the buffer.

   value and length may be evaluated more than once, so pass expressions
   without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9676
Victor Stinnerd3f08822012-05-29 12:57:52 +02009677void
9678_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9679 Py_UCS4 fill_char)
9680{
9681 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9682 const void *data = PyUnicode_DATA(unicode);
9683 assert(PyUnicode_IS_READY(unicode));
9684 assert(unicode_modifiable(unicode));
9685 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9686 assert(start >= 0);
9687 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9688 FILL(kind, data, fill_char, start, length);
9689}
9690
Victor Stinner3fe55312012-01-04 00:33:50 +01009691Py_ssize_t
9692PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9693 Py_UCS4 fill_char)
9694{
9695 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009696
9697 if (!PyUnicode_Check(unicode)) {
9698 PyErr_BadInternalCall();
9699 return -1;
9700 }
9701 if (PyUnicode_READY(unicode) == -1)
9702 return -1;
9703 if (unicode_check_modifiable(unicode))
9704 return -1;
9705
Victor Stinnerd3f08822012-05-29 12:57:52 +02009706 if (start < 0) {
9707 PyErr_SetString(PyExc_IndexError, "string index out of range");
9708 return -1;
9709 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009710 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9711 PyErr_SetString(PyExc_ValueError,
9712 "fill character is bigger than "
9713 "the string maximum character");
9714 return -1;
9715 }
9716
9717 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9718 length = Py_MIN(maxlen, length);
9719 if (length <= 0)
9720 return 0;
9721
Victor Stinnerd3f08822012-05-29 12:57:52 +02009722 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009723 return length;
9724}
9725
Victor Stinner9310abb2011-10-05 00:59:23 +02009726static PyObject *
9727pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009728 Py_ssize_t left,
9729 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 PyObject *u;
9733 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009734 int kind;
9735 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736
9737 if (left < 0)
9738 left = 0;
9739 if (right < 0)
9740 right = 0;
9741
Victor Stinnerc4b49542011-12-11 22:44:26 +01009742 if (left == 0 && right == 0)
9743 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9746 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009747 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9748 return NULL;
9749 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009751 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009753 if (!u)
9754 return NULL;
9755
9756 kind = PyUnicode_KIND(u);
9757 data = PyUnicode_DATA(u);
9758 if (left)
9759 FILL(kind, data, fill, 0, left);
9760 if (right)
9761 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009762 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009763 assert(_PyUnicode_CheckConsistency(u, 1));
9764 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765}
9766
Alexander Belopolsky40018472011-02-26 01:02:56 +00009767PyObject *
9768PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771
9772 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009773 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009774 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009775 if (PyUnicode_READY(string) == -1) {
9776 Py_DECREF(string);
9777 return NULL;
9778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779
Benjamin Petersonead6b532011-12-20 17:23:42 -06009780 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009782 if (PyUnicode_IS_ASCII(string))
9783 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009784 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009785 PyUnicode_GET_LENGTH(string), keepends);
9786 else
9787 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009789 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 break;
9791 case PyUnicode_2BYTE_KIND:
9792 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009793 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 PyUnicode_GET_LENGTH(string), keepends);
9795 break;
9796 case PyUnicode_4BYTE_KIND:
9797 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009798 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 PyUnicode_GET_LENGTH(string), keepends);
9800 break;
9801 default:
9802 assert(0);
9803 list = 0;
9804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805 Py_DECREF(string);
9806 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807}
9808
Alexander Belopolsky40018472011-02-26 01:02:56 +00009809static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009810split(PyObject *self,
9811 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009812 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 int kind1, kind2, kind;
9815 void *buf1, *buf2;
9816 Py_ssize_t len1, len2;
9817 PyObject* out;
9818
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009820 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 if (PyUnicode_READY(self) == -1)
9823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009826 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009828 if (PyUnicode_IS_ASCII(self))
9829 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009831 PyUnicode_GET_LENGTH(self), maxcount
9832 );
9833 else
9834 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009835 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009836 PyUnicode_GET_LENGTH(self), maxcount
9837 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 case PyUnicode_2BYTE_KIND:
9839 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009840 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 PyUnicode_GET_LENGTH(self), maxcount
9842 );
9843 case PyUnicode_4BYTE_KIND:
9844 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009845 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 PyUnicode_GET_LENGTH(self), maxcount
9847 );
9848 default:
9849 assert(0);
9850 return NULL;
9851 }
9852
9853 if (PyUnicode_READY(substring) == -1)
9854 return NULL;
9855
9856 kind1 = PyUnicode_KIND(self);
9857 kind2 = PyUnicode_KIND(substring);
9858 kind = kind1 > kind2 ? kind1 : kind2;
9859 buf1 = PyUnicode_DATA(self);
9860 buf2 = PyUnicode_DATA(substring);
9861 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009862 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 if (!buf1)
9864 return NULL;
9865 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009866 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 if (!buf2) {
9868 if (kind1 != kind) PyMem_Free(buf1);
9869 return NULL;
9870 }
9871 len1 = PyUnicode_GET_LENGTH(self);
9872 len2 = PyUnicode_GET_LENGTH(substring);
9873
Benjamin Petersonead6b532011-12-20 17:23:42 -06009874 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009876 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9877 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009878 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009879 else
9880 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009881 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 break;
9883 case PyUnicode_2BYTE_KIND:
9884 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009885 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 break;
9887 case PyUnicode_4BYTE_KIND:
9888 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009889 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 break;
9891 default:
9892 out = NULL;
9893 }
9894 if (kind1 != kind)
9895 PyMem_Free(buf1);
9896 if (kind2 != kind)
9897 PyMem_Free(buf2);
9898 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899}
9900
Alexander Belopolsky40018472011-02-26 01:02:56 +00009901static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009902rsplit(PyObject *self,
9903 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009904 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009905{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 int kind1, kind2, kind;
9907 void *buf1, *buf2;
9908 Py_ssize_t len1, len2;
9909 PyObject* out;
9910
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009911 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009912 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 if (PyUnicode_READY(self) == -1)
9915 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009918 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009920 if (PyUnicode_IS_ASCII(self))
9921 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009923 PyUnicode_GET_LENGTH(self), maxcount
9924 );
9925 else
9926 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009928 PyUnicode_GET_LENGTH(self), maxcount
9929 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 case PyUnicode_2BYTE_KIND:
9931 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009932 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 PyUnicode_GET_LENGTH(self), maxcount
9934 );
9935 case PyUnicode_4BYTE_KIND:
9936 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009937 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 PyUnicode_GET_LENGTH(self), maxcount
9939 );
9940 default:
9941 assert(0);
9942 return NULL;
9943 }
9944
9945 if (PyUnicode_READY(substring) == -1)
9946 return NULL;
9947
9948 kind1 = PyUnicode_KIND(self);
9949 kind2 = PyUnicode_KIND(substring);
9950 kind = kind1 > kind2 ? kind1 : kind2;
9951 buf1 = PyUnicode_DATA(self);
9952 buf2 = PyUnicode_DATA(substring);
9953 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009954 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 if (!buf1)
9956 return NULL;
9957 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009958 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 if (!buf2) {
9960 if (kind1 != kind) PyMem_Free(buf1);
9961 return NULL;
9962 }
9963 len1 = PyUnicode_GET_LENGTH(self);
9964 len2 = PyUnicode_GET_LENGTH(substring);
9965
Benjamin Petersonead6b532011-12-20 17:23:42 -06009966 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009968 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9969 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009970 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009971 else
9972 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009973 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 break;
9975 case PyUnicode_2BYTE_KIND:
9976 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009977 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 break;
9979 case PyUnicode_4BYTE_KIND:
9980 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009981 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 break;
9983 default:
9984 out = NULL;
9985 }
9986 if (kind1 != kind)
9987 PyMem_Free(buf1);
9988 if (kind2 != kind)
9989 PyMem_Free(buf2);
9990 return out;
9991}
9992
9993static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9995 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009997 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009999 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10000 return asciilib_find(buf1, len1, buf2, len2, offset);
10001 else
10002 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 case PyUnicode_2BYTE_KIND:
10004 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10005 case PyUnicode_4BYTE_KIND:
10006 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10007 }
10008 assert(0);
10009 return -1;
10010}
10011
10012static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010013anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10014 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010016 switch (kind) {
10017 case PyUnicode_1BYTE_KIND:
10018 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10019 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10020 else
10021 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10022 case PyUnicode_2BYTE_KIND:
10023 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10024 case PyUnicode_4BYTE_KIND:
10025 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10026 }
10027 assert(0);
10028 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010029}
10030
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010031static void
10032replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10033 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10034{
10035 int kind = PyUnicode_KIND(u);
10036 void *data = PyUnicode_DATA(u);
10037 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10038 if (kind == PyUnicode_1BYTE_KIND) {
10039 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10040 (Py_UCS1 *)data + len,
10041 u1, u2, maxcount);
10042 }
10043 else if (kind == PyUnicode_2BYTE_KIND) {
10044 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10045 (Py_UCS2 *)data + len,
10046 u1, u2, maxcount);
10047 }
10048 else {
10049 assert(kind == PyUnicode_4BYTE_KIND);
10050 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10051 (Py_UCS4 *)data + len,
10052 u1, u2, maxcount);
10053 }
10054}
10055
Alexander Belopolsky40018472011-02-26 01:02:56 +000010056static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057replace(PyObject *self, PyObject *str1,
10058 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 PyObject *u;
10061 char *sbuf = PyUnicode_DATA(self);
10062 char *buf1 = PyUnicode_DATA(str1);
10063 char *buf2 = PyUnicode_DATA(str2);
10064 int srelease = 0, release1 = 0, release2 = 0;
10065 int skind = PyUnicode_KIND(self);
10066 int kind1 = PyUnicode_KIND(str1);
10067 int kind2 = PyUnicode_KIND(str2);
10068 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10069 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10070 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010071 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010072 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010073
10074 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010075 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010077 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078
Victor Stinner59de0ee2011-10-07 10:01:28 +020010079 if (str1 == str2)
10080 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081
Victor Stinner49a0a212011-10-12 23:46:10 +020010082 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010083 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10084 if (maxchar < maxchar_str1)
10085 /* substring too wide to be present */
10086 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010087 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10088 /* Replacing str1 with str2 may cause a maxchar reduction in the
10089 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010090 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010091 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010094 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010096 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010098 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010099 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010100 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010101
Victor Stinner69ed0f42013-04-09 21:48:24 +020010102 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010103 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010104 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010105 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010106 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010108 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010110
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010111 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10112 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010113 }
10114 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 int rkind = skind;
10116 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010117 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (kind1 < rkind) {
10120 /* widen substring */
10121 buf1 = _PyUnicode_AsKind(str1, rkind);
10122 if (!buf1) goto error;
10123 release1 = 1;
10124 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010125 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010126 if (i < 0)
10127 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 if (rkind > kind2) {
10129 /* widen replacement */
10130 buf2 = _PyUnicode_AsKind(str2, rkind);
10131 if (!buf2) goto error;
10132 release2 = 1;
10133 }
10134 else if (rkind < kind2) {
10135 /* widen self and buf1 */
10136 rkind = kind2;
10137 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010138 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 sbuf = _PyUnicode_AsKind(self, rkind);
10140 if (!sbuf) goto error;
10141 srelease = 1;
10142 buf1 = _PyUnicode_AsKind(str1, rkind);
10143 if (!buf1) goto error;
10144 release1 = 1;
10145 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010146 u = PyUnicode_New(slen, maxchar);
10147 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010149 assert(PyUnicode_KIND(u) == rkind);
10150 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010151
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010152 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010153 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010154 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010156 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010158
10159 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010160 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010161 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010162 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010163 if (i == -1)
10164 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010165 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010167 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010171 }
10172 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010174 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 int rkind = skind;
10176 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010179 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 buf1 = _PyUnicode_AsKind(str1, rkind);
10181 if (!buf1) goto error;
10182 release1 = 1;
10183 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010184 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010185 if (n == 0)
10186 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010188 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 buf2 = _PyUnicode_AsKind(str2, rkind);
10190 if (!buf2) goto error;
10191 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010194 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 rkind = kind2;
10196 sbuf = _PyUnicode_AsKind(self, rkind);
10197 if (!sbuf) goto error;
10198 srelease = 1;
10199 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010200 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 buf1 = _PyUnicode_AsKind(str1, rkind);
10202 if (!buf1) goto error;
10203 release1 = 1;
10204 }
10205 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10206 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010207 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 PyErr_SetString(PyExc_OverflowError,
10209 "replace string is too long");
10210 goto error;
10211 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010212 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010213 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010214 _Py_INCREF_UNICODE_EMPTY();
10215 if (!unicode_empty)
10216 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010217 u = unicode_empty;
10218 goto done;
10219 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010220 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 PyErr_SetString(PyExc_OverflowError,
10222 "replace string is too long");
10223 goto error;
10224 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010225 u = PyUnicode_New(new_size, maxchar);
10226 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010228 assert(PyUnicode_KIND(u) == rkind);
10229 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 ires = i = 0;
10231 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010232 while (n-- > 0) {
10233 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010234 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010235 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010236 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010237 if (j == -1)
10238 break;
10239 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010240 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010241 memcpy(res + rkind * ires,
10242 sbuf + rkind * i,
10243 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010245 }
10246 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010248 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010256 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010257 memcpy(res + rkind * ires,
10258 sbuf + rkind * i,
10259 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010260 }
10261 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010262 /* interleave */
10263 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010264 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010266 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010268 if (--n <= 0)
10269 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010270 memcpy(res + rkind * ires,
10271 sbuf + rkind * i,
10272 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 ires++;
10274 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010275 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010276 memcpy(res + rkind * ires,
10277 sbuf + rkind * i,
10278 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010280 }
10281
10282 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010283 unicode_adjust_maxchar(&u);
10284 if (u == NULL)
10285 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010287
10288 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 if (srelease)
10290 PyMem_FREE(sbuf);
10291 if (release1)
10292 PyMem_FREE(buf1);
10293 if (release2)
10294 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010295 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010297
Benjamin Peterson29060642009-01-31 22:14:21 +000010298 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010299 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 if (srelease)
10301 PyMem_FREE(sbuf);
10302 if (release1)
10303 PyMem_FREE(buf1);
10304 if (release2)
10305 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010306 return unicode_result_unchanged(self);
10307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 error:
10309 if (srelease && sbuf)
10310 PyMem_FREE(sbuf);
10311 if (release1 && buf1)
10312 PyMem_FREE(buf1);
10313 if (release2 && buf2)
10314 PyMem_FREE(buf2);
10315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316}
10317
10318/* --- Unicode Object Methods --------------------------------------------- */
10319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010320PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010321 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322\n\
10323Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010324characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325
10326static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010327unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010329 if (PyUnicode_READY(self) == -1)
10330 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010331 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332}
10333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010334PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010335 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336\n\
10337Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010338have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339
10340static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010341unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010343 if (PyUnicode_READY(self) == -1)
10344 return NULL;
10345 if (PyUnicode_GET_LENGTH(self) == 0)
10346 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010347 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348}
10349
Benjamin Petersond5890c82012-01-14 13:23:30 -050010350PyDoc_STRVAR(casefold__doc__,
10351 "S.casefold() -> str\n\
10352\n\
10353Return a version of S suitable for caseless comparisons.");
10354
10355static PyObject *
10356unicode_casefold(PyObject *self)
10357{
10358 if (PyUnicode_READY(self) == -1)
10359 return NULL;
10360 if (PyUnicode_IS_ASCII(self))
10361 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010362 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010363}
10364
10365
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010366/* Argument converter. Coerces to a single unicode character */
10367
10368static int
10369convert_uc(PyObject *obj, void *addr)
10370{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010372 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010373
Benjamin Peterson14339b62009-01-31 16:36:08 +000010374 uniobj = PyUnicode_FromObject(obj);
10375 if (uniobj == NULL) {
10376 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010377 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010378 return 0;
10379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010381 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010382 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010383 Py_DECREF(uniobj);
10384 return 0;
10385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010387 Py_DECREF(uniobj);
10388 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010389}
10390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010391PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010392 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010394Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010395done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396
10397static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010398unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010400 Py_ssize_t marg, left;
10401 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 Py_UCS4 fillchar = ' ';
10403
Victor Stinnere9a29352011-10-01 02:14:59 +020010404 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406
Benjamin Petersonbac79492012-01-14 13:34:47 -050010407 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408 return NULL;
10409
Victor Stinnerc4b49542011-12-11 22:44:26 +010010410 if (PyUnicode_GET_LENGTH(self) >= width)
10411 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412
Victor Stinnerc4b49542011-12-11 22:44:26 +010010413 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414 left = marg / 2 + (marg & width & 1);
10415
Victor Stinner9310abb2011-10-05 00:59:23 +020010416 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417}
10418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419/* This function assumes that str1 and str2 are readied by the caller. */
10420
Marc-André Lemburge5034372000-08-08 08:04:29 +000010421static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010422unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010423{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010424#define COMPARE(TYPE1, TYPE2) \
10425 do { \
10426 TYPE1* p1 = (TYPE1 *)data1; \
10427 TYPE2* p2 = (TYPE2 *)data2; \
10428 TYPE1* end = p1 + len; \
10429 Py_UCS4 c1, c2; \
10430 for (; p1 != end; p1++, p2++) { \
10431 c1 = *p1; \
10432 c2 = *p2; \
10433 if (c1 != c2) \
10434 return (c1 < c2) ? -1 : 1; \
10435 } \
10436 } \
10437 while (0)
10438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 int kind1, kind2;
10440 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010441 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 kind1 = PyUnicode_KIND(str1);
10444 kind2 = PyUnicode_KIND(str2);
10445 data1 = PyUnicode_DATA(str1);
10446 data2 = PyUnicode_DATA(str2);
10447 len1 = PyUnicode_GET_LENGTH(str1);
10448 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010449 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010450
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010451 switch(kind1) {
10452 case PyUnicode_1BYTE_KIND:
10453 {
10454 switch(kind2) {
10455 case PyUnicode_1BYTE_KIND:
10456 {
10457 int cmp = memcmp(data1, data2, len);
10458 /* normalize result of memcmp() into the range [-1; 1] */
10459 if (cmp < 0)
10460 return -1;
10461 if (cmp > 0)
10462 return 1;
10463 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010464 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010465 case PyUnicode_2BYTE_KIND:
10466 COMPARE(Py_UCS1, Py_UCS2);
10467 break;
10468 case PyUnicode_4BYTE_KIND:
10469 COMPARE(Py_UCS1, Py_UCS4);
10470 break;
10471 default:
10472 assert(0);
10473 }
10474 break;
10475 }
10476 case PyUnicode_2BYTE_KIND:
10477 {
10478 switch(kind2) {
10479 case PyUnicode_1BYTE_KIND:
10480 COMPARE(Py_UCS2, Py_UCS1);
10481 break;
10482 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010483 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010484 COMPARE(Py_UCS2, Py_UCS2);
10485 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010486 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010487 case PyUnicode_4BYTE_KIND:
10488 COMPARE(Py_UCS2, Py_UCS4);
10489 break;
10490 default:
10491 assert(0);
10492 }
10493 break;
10494 }
10495 case PyUnicode_4BYTE_KIND:
10496 {
10497 switch(kind2) {
10498 case PyUnicode_1BYTE_KIND:
10499 COMPARE(Py_UCS4, Py_UCS1);
10500 break;
10501 case PyUnicode_2BYTE_KIND:
10502 COMPARE(Py_UCS4, Py_UCS2);
10503 break;
10504 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010505 {
10506#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10507 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10508 /* normalize result of wmemcmp() into the range [-1; 1] */
10509 if (cmp < 0)
10510 return -1;
10511 if (cmp > 0)
10512 return 1;
10513#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010514 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010515#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010516 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010517 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010518 default:
10519 assert(0);
10520 }
10521 break;
10522 }
10523 default:
10524 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010525 }
10526
Victor Stinner770e19e2012-10-04 22:59:45 +020010527 if (len1 == len2)
10528 return 0;
10529 if (len1 < len2)
10530 return -1;
10531 else
10532 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010533
10534#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010535}
10536
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010537Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010538unicode_compare_eq(PyObject *str1, PyObject *str2)
10539{
10540 int kind;
10541 void *data1, *data2;
10542 Py_ssize_t len;
10543 int cmp;
10544
Victor Stinnere5567ad2012-10-23 02:48:49 +020010545 len = PyUnicode_GET_LENGTH(str1);
10546 if (PyUnicode_GET_LENGTH(str2) != len)
10547 return 0;
10548 kind = PyUnicode_KIND(str1);
10549 if (PyUnicode_KIND(str2) != kind)
10550 return 0;
10551 data1 = PyUnicode_DATA(str1);
10552 data2 = PyUnicode_DATA(str2);
10553
10554 cmp = memcmp(data1, data2, len * kind);
10555 return (cmp == 0);
10556}
10557
10558
Alexander Belopolsky40018472011-02-26 01:02:56 +000010559int
10560PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10563 if (PyUnicode_READY(left) == -1 ||
10564 PyUnicode_READY(right) == -1)
10565 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010566
10567 /* a string is equal to itself */
10568 if (left == right)
10569 return 0;
10570
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010571 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010573 PyErr_Format(PyExc_TypeError,
10574 "Can't compare %.100s and %.100s",
10575 left->ob_type->tp_name,
10576 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577 return -1;
10578}
10579
Martin v. Löwis5b222132007-06-10 09:51:05 +000010580int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010581_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10582{
10583 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10584 if (right_str == NULL)
10585 return -1;
10586 return PyUnicode_Compare(left, right_str);
10587}
10588
10589int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010590PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 Py_ssize_t i;
10593 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 Py_UCS4 chr;
10595
Victor Stinner910337b2011-10-03 03:20:16 +020010596 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (PyUnicode_READY(uni) == -1)
10598 return -1;
10599 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010600 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010601 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010602 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010603 size_t len, len2 = strlen(str);
10604 int cmp;
10605
10606 len = Py_MIN(len1, len2);
10607 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010608 if (cmp != 0) {
10609 if (cmp < 0)
10610 return -1;
10611 else
10612 return 1;
10613 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010614 if (len1 > len2)
10615 return 1; /* uni is longer */
10616 if (len2 > len1)
10617 return -1; /* str is longer */
10618 return 0;
10619 }
10620 else {
10621 void *data = PyUnicode_DATA(uni);
10622 /* Compare Unicode string and source character set string */
10623 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10624 if (chr != str[i])
10625 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10626 /* This check keeps Python strings that end in '\0' from comparing equal
10627 to C strings identical up to that point. */
10628 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10629 return 1; /* uni is longer */
10630 if (str[i])
10631 return -1; /* str is longer */
10632 return 0;
10633 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010634}
10635
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010636
Benjamin Peterson29060642009-01-31 22:14:21 +000010637#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010638 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010639
Alexander Belopolsky40018472011-02-26 01:02:56 +000010640PyObject *
10641PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010642{
10643 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010644 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010645
Victor Stinnere5567ad2012-10-23 02:48:49 +020010646 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10647 Py_RETURN_NOTIMPLEMENTED;
10648
10649 if (PyUnicode_READY(left) == -1 ||
10650 PyUnicode_READY(right) == -1)
10651 return NULL;
10652
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010653 if (left == right) {
10654 switch (op) {
10655 case Py_EQ:
10656 case Py_LE:
10657 case Py_GE:
10658 /* a string is equal to itself */
10659 v = Py_True;
10660 break;
10661 case Py_NE:
10662 case Py_LT:
10663 case Py_GT:
10664 v = Py_False;
10665 break;
10666 default:
10667 PyErr_BadArgument();
10668 return NULL;
10669 }
10670 }
10671 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010672 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010673 result ^= (op == Py_NE);
10674 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010675 }
10676 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010677 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010678
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010679 /* Convert the return value to a Boolean */
10680 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010681 case Py_LE:
10682 v = TEST_COND(result <= 0);
10683 break;
10684 case Py_GE:
10685 v = TEST_COND(result >= 0);
10686 break;
10687 case Py_LT:
10688 v = TEST_COND(result == -1);
10689 break;
10690 case Py_GT:
10691 v = TEST_COND(result == 1);
10692 break;
10693 default:
10694 PyErr_BadArgument();
10695 return NULL;
10696 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010697 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010698 Py_INCREF(v);
10699 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010700}
10701
Alexander Belopolsky40018472011-02-26 01:02:56 +000010702int
10703PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010704{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010705 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010706 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 void *buf1, *buf2;
10708 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010709 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010710
10711 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010712 sub = PyUnicode_FromObject(element);
10713 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010714 PyErr_Format(PyExc_TypeError,
10715 "'in <string>' requires string as left operand, not %s",
10716 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010717 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010718 }
10719
Thomas Wouters477c8d52006-05-27 19:21:47 +000010720 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010721 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722 Py_DECREF(sub);
10723 return -1;
10724 }
10725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 kind1 = PyUnicode_KIND(str);
10727 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 buf1 = PyUnicode_DATA(str);
10729 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010730 if (kind2 != kind1) {
10731 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010732 Py_DECREF(sub);
10733 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010734 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010735 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010736 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 if (!buf2) {
10739 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010740 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 return -1;
10742 }
10743 len1 = PyUnicode_GET_LENGTH(str);
10744 len2 = PyUnicode_GET_LENGTH(sub);
10745
Victor Stinner77282cb2013-04-14 19:22:47 +020010746 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 case PyUnicode_1BYTE_KIND:
10748 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10749 break;
10750 case PyUnicode_2BYTE_KIND:
10751 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10752 break;
10753 case PyUnicode_4BYTE_KIND:
10754 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10755 break;
10756 default:
10757 result = -1;
10758 assert(0);
10759 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010760
10761 Py_DECREF(str);
10762 Py_DECREF(sub);
10763
Victor Stinner77282cb2013-04-14 19:22:47 +020010764 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 PyMem_Free(buf2);
10766
Guido van Rossum403d68b2000-03-13 15:55:09 +000010767 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010768}
10769
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770/* Concat to string or Unicode object giving a new Unicode object. */
10771
Alexander Belopolsky40018472011-02-26 01:02:56 +000010772PyObject *
10773PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010776 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010777 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778
10779 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010782 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786
10787 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010788 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010789 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010792 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010793 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795 }
10796
Victor Stinner488fa492011-12-12 00:01:39 +010010797 u_len = PyUnicode_GET_LENGTH(u);
10798 v_len = PyUnicode_GET_LENGTH(v);
10799 if (u_len > PY_SSIZE_T_MAX - v_len) {
10800 PyErr_SetString(PyExc_OverflowError,
10801 "strings are too large to concat");
10802 goto onError;
10803 }
10804 new_len = u_len + v_len;
10805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010807 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010808 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010811 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010813 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010814 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10815 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816 Py_DECREF(u);
10817 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010818 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820
Benjamin Peterson29060642009-01-31 22:14:21 +000010821 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 Py_XDECREF(u);
10823 Py_XDECREF(v);
10824 return NULL;
10825}
10826
Walter Dörwald1ab83302007-05-18 17:15:44 +000010827void
Victor Stinner23e56682011-10-03 03:54:37 +020010828PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010829{
Victor Stinner23e56682011-10-03 03:54:37 +020010830 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010831 Py_UCS4 maxchar, maxchar2;
10832 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010833
10834 if (p_left == NULL) {
10835 if (!PyErr_Occurred())
10836 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010837 return;
10838 }
Victor Stinner23e56682011-10-03 03:54:37 +020010839 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020010840 if (right == NULL || left == NULL
10841 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010842 if (!PyErr_Occurred())
10843 PyErr_BadInternalCall();
10844 goto error;
10845 }
10846
Benjamin Petersonbac79492012-01-14 13:34:47 -050010847 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010848 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010849 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010850 goto error;
10851
Victor Stinner488fa492011-12-12 00:01:39 +010010852 /* Shortcuts */
10853 if (left == unicode_empty) {
10854 Py_DECREF(left);
10855 Py_INCREF(right);
10856 *p_left = right;
10857 return;
10858 }
10859 if (right == unicode_empty)
10860 return;
10861
10862 left_len = PyUnicode_GET_LENGTH(left);
10863 right_len = PyUnicode_GET_LENGTH(right);
10864 if (left_len > PY_SSIZE_T_MAX - right_len) {
10865 PyErr_SetString(PyExc_OverflowError,
10866 "strings are too large to concat");
10867 goto error;
10868 }
10869 new_len = left_len + right_len;
10870
10871 if (unicode_modifiable(left)
10872 && PyUnicode_CheckExact(right)
10873 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010874 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10875 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010876 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010877 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010878 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10879 {
10880 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010881 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010010882 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020010883
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010884 /* copy 'right' into the newly allocated area of 'left' */
10885 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010886 }
Victor Stinner488fa492011-12-12 00:01:39 +010010887 else {
10888 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10889 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010890 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010891
Victor Stinner488fa492011-12-12 00:01:39 +010010892 /* Concat the two Unicode strings */
10893 res = PyUnicode_New(new_len, maxchar);
10894 if (res == NULL)
10895 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010896 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10897 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010898 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010899 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010010900 }
10901 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010902 return;
10903
10904error:
Victor Stinner488fa492011-12-12 00:01:39 +010010905 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010906}
10907
10908void
10909PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10910{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010911 PyUnicode_Append(pleft, right);
10912 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010913}
10914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010915PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010916 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010918Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010919string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010920interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921
10922static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010923unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010925 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010926 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010927 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 int kind1, kind2, kind;
10930 void *buf1, *buf2;
10931 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932
Jesus Ceaac451502011-04-20 17:09:23 +020010933 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10934 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010935 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 kind1 = PyUnicode_KIND(self);
10938 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020010939 if (kind2 > kind1) {
10940 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010941 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020010942 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010943 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 buf1 = PyUnicode_DATA(self);
10945 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010947 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 if (!buf2) {
10949 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 return NULL;
10951 }
10952 len1 = PyUnicode_GET_LENGTH(self);
10953 len2 = PyUnicode_GET_LENGTH(substring);
10954
10955 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010956 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 case PyUnicode_1BYTE_KIND:
10958 iresult = ucs1lib_count(
10959 ((Py_UCS1*)buf1) + start, end - start,
10960 buf2, len2, PY_SSIZE_T_MAX
10961 );
10962 break;
10963 case PyUnicode_2BYTE_KIND:
10964 iresult = ucs2lib_count(
10965 ((Py_UCS2*)buf1) + start, end - start,
10966 buf2, len2, PY_SSIZE_T_MAX
10967 );
10968 break;
10969 case PyUnicode_4BYTE_KIND:
10970 iresult = ucs4lib_count(
10971 ((Py_UCS4*)buf1) + start, end - start,
10972 buf2, len2, PY_SSIZE_T_MAX
10973 );
10974 break;
10975 default:
10976 assert(0); iresult = 0;
10977 }
10978
10979 result = PyLong_FromSsize_t(iresult);
10980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 if (kind2 != kind)
10982 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
10984 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010985
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 return result;
10987}
10988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010989PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010990 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010992Encode S using the codec registered for encoding. Default encoding\n\
10993is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010994handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010995a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10996'xmlcharrefreplace' as well as any other name registered with\n\
10997codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998
10999static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011000unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011002 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 char *encoding = NULL;
11004 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011005
Benjamin Peterson308d6372009-09-18 21:42:35 +000011006 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11007 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011009 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011010}
11011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011012PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011013 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014\n\
11015Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011016If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017
11018static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011019unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011021 Py_ssize_t i, j, line_pos, src_len, incr;
11022 Py_UCS4 ch;
11023 PyObject *u;
11024 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011025 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011027 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011028 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029
Ezio Melotti745d54d2013-11-16 19:10:57 +020011030 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11031 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033
Antoine Pitrou22425222011-10-04 19:10:51 +020011034 if (PyUnicode_READY(self) == -1)
11035 return NULL;
11036
Thomas Wouters7e474022000-07-16 12:04:32 +000011037 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011038 src_len = PyUnicode_GET_LENGTH(self);
11039 i = j = line_pos = 0;
11040 kind = PyUnicode_KIND(self);
11041 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011042 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011043 for (; i < src_len; i++) {
11044 ch = PyUnicode_READ(kind, src_data, i);
11045 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011046 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011047 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011048 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011049 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011050 goto overflow;
11051 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011053 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011054 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011057 goto overflow;
11058 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011060 if (ch == '\n' || ch == '\r')
11061 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011063 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011064 if (!found)
11065 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011066
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011068 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069 if (!u)
11070 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011071 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072
Antoine Pitroue71d5742011-10-04 15:55:09 +020011073 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074
Antoine Pitroue71d5742011-10-04 15:55:09 +020011075 for (; i < src_len; i++) {
11076 ch = PyUnicode_READ(kind, src_data, i);
11077 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011078 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011079 incr = tabsize - (line_pos % tabsize);
11080 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011081 FILL(kind, dest_data, ' ', j, incr);
11082 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011083 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011084 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011085 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011086 line_pos++;
11087 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011088 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011089 if (ch == '\n' || ch == '\r')
11090 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011092 }
11093 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011094 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011095
Antoine Pitroue71d5742011-10-04 15:55:09 +020011096 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011097 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099}
11100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011101PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103\n\
11104Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011105such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106arguments start and end are interpreted as in slice notation.\n\
11107\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011108Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
11110static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011113 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011114 Py_ssize_t start;
11115 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011116 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117
Jesus Ceaac451502011-04-20 17:09:23 +020011118 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11119 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121
Christian Heimesd47802e2013-06-29 21:33:36 +020011122 if (PyUnicode_READY(self) == -1) {
11123 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011125 }
11126 if (PyUnicode_READY(substring) == -1) {
11127 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130
Victor Stinner7931d9a2011-11-04 00:22:48 +010011131 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132
11133 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 if (result == -2)
11136 return NULL;
11137
Christian Heimes217cfd12007-12-02 14:31:20 +000011138 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139}
11140
11141static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011142unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011144 void *data;
11145 enum PyUnicode_Kind kind;
11146 Py_UCS4 ch;
11147 PyObject *res;
11148
11149 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11150 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011152 }
11153 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11154 PyErr_SetString(PyExc_IndexError, "string index out of range");
11155 return NULL;
11156 }
11157 kind = PyUnicode_KIND(self);
11158 data = PyUnicode_DATA(self);
11159 ch = PyUnicode_READ(kind, data, index);
11160 if (ch < 256)
11161 return get_latin1_char(ch);
11162
11163 res = PyUnicode_New(1, ch);
11164 if (res == NULL)
11165 return NULL;
11166 kind = PyUnicode_KIND(res);
11167 data = PyUnicode_DATA(res);
11168 PyUnicode_WRITE(kind, data, 0, ch);
11169 assert(_PyUnicode_CheckConsistency(res, 1));
11170 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171}
11172
Guido van Rossumc2504932007-09-18 19:42:40 +000011173/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011174 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011175static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011176unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177{
Guido van Rossumc2504932007-09-18 19:42:40 +000011178 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011179 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011180
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011181#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011182 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011183#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 if (_PyUnicode_HASH(self) != -1)
11185 return _PyUnicode_HASH(self);
11186 if (PyUnicode_READY(self) == -1)
11187 return -1;
11188 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011189 /*
11190 We make the hash of the empty string be 0, rather than using
11191 (prefix ^ suffix), since this slightly obfuscates the hash secret
11192 */
11193 if (len == 0) {
11194 _PyUnicode_HASH(self) = 0;
11195 return 0;
11196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197
11198 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011199#define HASH(P) \
11200 x ^= (Py_uhash_t) *P << 7; \
11201 while (--len >= 0) \
11202 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203
Georg Brandl2fb477c2012-02-21 00:33:36 +010011204 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 switch (PyUnicode_KIND(self)) {
11206 case PyUnicode_1BYTE_KIND: {
11207 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11208 HASH(c);
11209 break;
11210 }
11211 case PyUnicode_2BYTE_KIND: {
11212 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11213 HASH(s);
11214 break;
11215 }
11216 default: {
11217 Py_UCS4 *l;
11218 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11219 "Impossible switch case in unicode_hash");
11220 l = PyUnicode_4BYTE_DATA(self);
11221 HASH(l);
11222 break;
11223 }
11224 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011225 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11226 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227
Guido van Rossumc2504932007-09-18 19:42:40 +000011228 if (x == -1)
11229 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011231 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011235PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011238Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239
11240static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011243 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011244 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011245 Py_ssize_t start;
11246 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
Jesus Ceaac451502011-04-20 17:09:23 +020011248 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11249 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
Christian Heimesd47a0452013-06-29 21:21:37 +020011252 if (PyUnicode_READY(self) == -1) {
11253 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011255 }
11256 if (PyUnicode_READY(substring) == -1) {
11257 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260
Victor Stinner7931d9a2011-11-04 00:22:48 +010011261 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
11263 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (result == -2)
11266 return NULL;
11267
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 if (result < 0) {
11269 PyErr_SetString(PyExc_ValueError, "substring not found");
11270 return NULL;
11271 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011272
Christian Heimes217cfd12007-12-02 14:31:20 +000011273 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274}
11275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011276PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011277 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011279Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011280at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
11282static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011283unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 Py_ssize_t i, length;
11286 int kind;
11287 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288 int cased;
11289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 if (PyUnicode_READY(self) == -1)
11291 return NULL;
11292 length = PyUnicode_GET_LENGTH(self);
11293 kind = PyUnicode_KIND(self);
11294 data = PyUnicode_DATA(self);
11295
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 if (length == 1)
11298 return PyBool_FromLong(
11299 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011301 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011304
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 for (i = 0; i < length; i++) {
11307 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011308
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11310 return PyBool_FromLong(0);
11311 else if (!cased && Py_UNICODE_ISLOWER(ch))
11312 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011314 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315}
11316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011317PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011318 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011320Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011321at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
11323static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011324unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 Py_ssize_t i, length;
11327 int kind;
11328 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 int cased;
11330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 if (PyUnicode_READY(self) == -1)
11332 return NULL;
11333 length = PyUnicode_GET_LENGTH(self);
11334 kind = PyUnicode_KIND(self);
11335 data = PyUnicode_DATA(self);
11336
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (length == 1)
11339 return PyBool_FromLong(
11340 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011342 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011345
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 for (i = 0; i < length; i++) {
11348 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011349
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11351 return PyBool_FromLong(0);
11352 else if (!cased && Py_UNICODE_ISUPPER(ch))
11353 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011355 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356}
11357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011358PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011359 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011361Return True if S is a titlecased string and there is at least one\n\
11362character in S, i.e. upper- and titlecase characters may only\n\
11363follow uncased characters and lowercase characters only cased ones.\n\
11364Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365
11366static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011367unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 Py_ssize_t i, length;
11370 int kind;
11371 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 int cased, previous_is_cased;
11373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (PyUnicode_READY(self) == -1)
11375 return NULL;
11376 length = PyUnicode_GET_LENGTH(self);
11377 kind = PyUnicode_KIND(self);
11378 data = PyUnicode_DATA(self);
11379
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 if (length == 1) {
11382 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11383 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11384 (Py_UNICODE_ISUPPER(ch) != 0));
11385 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011387 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011390
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391 cased = 0;
11392 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 for (i = 0; i < length; i++) {
11394 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011395
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11397 if (previous_is_cased)
11398 return PyBool_FromLong(0);
11399 previous_is_cased = 1;
11400 cased = 1;
11401 }
11402 else if (Py_UNICODE_ISLOWER(ch)) {
11403 if (!previous_is_cased)
11404 return PyBool_FromLong(0);
11405 previous_is_cased = 1;
11406 cased = 1;
11407 }
11408 else
11409 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011411 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011417Return True if all characters in S are whitespace\n\
11418and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
11420static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011421unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 Py_ssize_t i, length;
11424 int kind;
11425 void *data;
11426
11427 if (PyUnicode_READY(self) == -1)
11428 return NULL;
11429 length = PyUnicode_GET_LENGTH(self);
11430 kind = PyUnicode_KIND(self);
11431 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 if (length == 1)
11435 return PyBool_FromLong(
11436 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011438 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 for (i = 0; i < length; i++) {
11443 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011444 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011447 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448}
11449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011450PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011452\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011453Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011454and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011455
11456static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011457unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011458{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 Py_ssize_t i, length;
11460 int kind;
11461 void *data;
11462
11463 if (PyUnicode_READY(self) == -1)
11464 return NULL;
11465 length = PyUnicode_GET_LENGTH(self);
11466 kind = PyUnicode_KIND(self);
11467 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011468
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011469 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 if (length == 1)
11471 return PyBool_FromLong(
11472 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011473
11474 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 for (i = 0; i < length; i++) {
11479 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011481 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011482 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011483}
11484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011485PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011487\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011488Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011489and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011490
11491static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011492unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 int kind;
11495 void *data;
11496 Py_ssize_t len, i;
11497
11498 if (PyUnicode_READY(self) == -1)
11499 return NULL;
11500
11501 kind = PyUnicode_KIND(self);
11502 data = PyUnicode_DATA(self);
11503 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011504
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011505 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 if (len == 1) {
11507 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11508 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11509 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011510
11511 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 for (i = 0; i < len; i++) {
11516 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011517 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011519 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011520 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011521}
11522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011523PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011526Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011527False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
11529static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011530unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 Py_ssize_t i, length;
11533 int kind;
11534 void *data;
11535
11536 if (PyUnicode_READY(self) == -1)
11537 return NULL;
11538 length = PyUnicode_GET_LENGTH(self);
11539 kind = PyUnicode_KIND(self);
11540 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 if (length == 1)
11544 return PyBool_FromLong(
11545 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011547 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 for (i = 0; i < length; i++) {
11552 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011555 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556}
11557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011558PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011561Return True if all characters in S are digits\n\
11562and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563
11564static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011565unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 Py_ssize_t i, length;
11568 int kind;
11569 void *data;
11570
11571 if (PyUnicode_READY(self) == -1)
11572 return NULL;
11573 length = PyUnicode_GET_LENGTH(self);
11574 kind = PyUnicode_KIND(self);
11575 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 if (length == 1) {
11579 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11580 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011583 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 for (i = 0; i < length; i++) {
11588 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011591 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592}
11593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011594PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011597Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011598False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599
11600static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011601unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 Py_ssize_t i, length;
11604 int kind;
11605 void *data;
11606
11607 if (PyUnicode_READY(self) == -1)
11608 return NULL;
11609 length = PyUnicode_GET_LENGTH(self);
11610 kind = PyUnicode_KIND(self);
11611 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 if (length == 1)
11615 return PyBool_FromLong(
11616 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011618 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011620 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 for (i = 0; i < length; i++) {
11623 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011626 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627}
11628
Martin v. Löwis47383402007-08-15 07:32:56 +000011629int
11630PyUnicode_IsIdentifier(PyObject *self)
11631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 int kind;
11633 void *data;
11634 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011635 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (PyUnicode_READY(self) == -1) {
11638 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 }
11641
11642 /* Special case for empty strings */
11643 if (PyUnicode_GET_LENGTH(self) == 0)
11644 return 0;
11645 kind = PyUnicode_KIND(self);
11646 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011647
11648 /* PEP 3131 says that the first character must be in
11649 XID_Start and subsequent characters in XID_Continue,
11650 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011651 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011652 letters, digits, underscore). However, given the current
11653 definition of XID_Start and XID_Continue, it is sufficient
11654 to check just for these, except that _ must be allowed
11655 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011657 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011658 return 0;
11659
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011660 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011662 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011663 return 1;
11664}
11665
11666PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011668\n\
11669Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011670to the language definition.\n\
11671\n\
11672Use keyword.iskeyword() to test for reserved identifiers\n\
11673such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011674
11675static PyObject*
11676unicode_isidentifier(PyObject *self)
11677{
11678 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11679}
11680
Georg Brandl559e5d72008-06-11 18:37:52 +000011681PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011682 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011683\n\
11684Return True if all characters in S are considered\n\
11685printable in repr() or S is empty, False otherwise.");
11686
11687static PyObject*
11688unicode_isprintable(PyObject *self)
11689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 Py_ssize_t i, length;
11691 int kind;
11692 void *data;
11693
11694 if (PyUnicode_READY(self) == -1)
11695 return NULL;
11696 length = PyUnicode_GET_LENGTH(self);
11697 kind = PyUnicode_KIND(self);
11698 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011699
11700 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 if (length == 1)
11702 return PyBool_FromLong(
11703 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 for (i = 0; i < length; i++) {
11706 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011707 Py_RETURN_FALSE;
11708 }
11709 }
11710 Py_RETURN_TRUE;
11711}
11712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011713PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011714 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715\n\
11716Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011717iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718
11719static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011720unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011722 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723}
11724
Martin v. Löwis18e16552006-02-15 17:27:45 +000011725static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011726unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (PyUnicode_READY(self) == -1)
11729 return -1;
11730 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731}
11732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011733PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011736Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011737done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
11739static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011740unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011742 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 Py_UCS4 fillchar = ' ';
11744
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011745 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 return NULL;
11747
Benjamin Petersonbac79492012-01-14 13:34:47 -050011748 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750
Victor Stinnerc4b49542011-12-11 22:44:26 +010011751 if (PyUnicode_GET_LENGTH(self) >= width)
11752 return unicode_result_unchanged(self);
11753
11754 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755}
11756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011760Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761
11762static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011763unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011765 if (PyUnicode_READY(self) == -1)
11766 return NULL;
11767 if (PyUnicode_IS_ASCII(self))
11768 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011769 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770}
11771
/* Strip-type selectors; they double as indexes into stripformat[]. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* One format string per strip type, indexed by the selectors above.  Each
   consists of the three characters "|O:" followed by the method name. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11780
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011781/* externally visible for str.strip(unicode) */
11782PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011783_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 void *data;
11786 int kind;
11787 Py_ssize_t i, j, len;
11788 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011789 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11792 return NULL;
11793
11794 kind = PyUnicode_KIND(self);
11795 data = PyUnicode_DATA(self);
11796 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011797 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11799 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011800 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011801
Benjamin Peterson14339b62009-01-31 16:36:08 +000011802 i = 0;
11803 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011804 while (i < len) {
11805 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11806 if (!BLOOM(sepmask, ch))
11807 break;
11808 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11809 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 i++;
11811 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011812 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011813
Benjamin Peterson14339b62009-01-31 16:36:08 +000011814 j = len;
11815 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011816 j--;
11817 while (j >= i) {
11818 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11819 if (!BLOOM(sepmask, ch))
11820 break;
11821 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11822 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011824 }
11825
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011827 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011828
Victor Stinner7931d9a2011-11-04 00:22:48 +010011829 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830}
11831
11832PyObject*
11833PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11834{
11835 unsigned char *data;
11836 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011837 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838
Victor Stinnerde636f32011-10-01 03:55:54 +020011839 if (PyUnicode_READY(self) == -1)
11840 return NULL;
11841
Victor Stinner684d5fd2012-05-03 02:32:34 +020011842 length = PyUnicode_GET_LENGTH(self);
11843 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011844
Victor Stinner684d5fd2012-05-03 02:32:34 +020011845 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011846 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847
Victor Stinnerde636f32011-10-01 03:55:54 +020011848 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011849 PyErr_SetString(PyExc_IndexError, "string index out of range");
11850 return NULL;
11851 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011852 if (start >= length || end < start)
11853 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011854
Victor Stinner684d5fd2012-05-03 02:32:34 +020011855 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011856 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011857 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011858 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011859 }
11860 else {
11861 kind = PyUnicode_KIND(self);
11862 data = PyUnicode_1BYTE_DATA(self);
11863 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011864 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011865 length);
11866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868
11869static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011870do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 Py_ssize_t len, i, j;
11873
11874 if (PyUnicode_READY(self) == -1)
11875 return NULL;
11876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011878
Victor Stinnercc7af722013-04-09 22:39:24 +020011879 if (PyUnicode_IS_ASCII(self)) {
11880 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11881
11882 i = 0;
11883 if (striptype != RIGHTSTRIP) {
11884 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011885 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020011886 if (!_Py_ascii_whitespace[ch])
11887 break;
11888 i++;
11889 }
11890 }
11891
11892 j = len;
11893 if (striptype != LEFTSTRIP) {
11894 j--;
11895 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011896 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020011897 if (!_Py_ascii_whitespace[ch])
11898 break;
11899 j--;
11900 }
11901 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011902 }
11903 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011904 else {
11905 int kind = PyUnicode_KIND(self);
11906 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011907
Victor Stinnercc7af722013-04-09 22:39:24 +020011908 i = 0;
11909 if (striptype != RIGHTSTRIP) {
11910 while (i < len) {
11911 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11912 if (!Py_UNICODE_ISSPACE(ch))
11913 break;
11914 i++;
11915 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011916 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011917
11918 j = len;
11919 if (striptype != LEFTSTRIP) {
11920 j--;
11921 while (j >= i) {
11922 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11923 if (!Py_UNICODE_ISSPACE(ch))
11924 break;
11925 j--;
11926 }
11927 j++;
11928 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011929 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011930
Victor Stinner7931d9a2011-11-04 00:22:48 +010011931 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932}
11933
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011934
11935static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011936do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011937{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011938 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011939
Serhiy Storchakac6792272013-10-19 21:03:34 +030011940 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011941 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011942
Benjamin Peterson14339b62009-01-31 16:36:08 +000011943 if (sep != NULL && sep != Py_None) {
11944 if (PyUnicode_Check(sep))
11945 return _PyUnicode_XStrip(self, striptype, sep);
11946 else {
11947 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 "%s arg must be None or str",
11949 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011950 return NULL;
11951 }
11952 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011953
Benjamin Peterson14339b62009-01-31 16:36:08 +000011954 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011955}
11956
11957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011958PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011960\n\
11961Return a copy of the string S with leading and trailing\n\
11962whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011963If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011964
11965static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011966unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011967{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011968 if (PyTuple_GET_SIZE(args) == 0)
11969 return do_strip(self, BOTHSTRIP); /* Common case */
11970 else
11971 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011972}
11973
11974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011975PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011977\n\
11978Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011979If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011980
11981static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011982unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011983{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011984 if (PyTuple_GET_SIZE(args) == 0)
11985 return do_strip(self, LEFTSTRIP); /* Common case */
11986 else
11987 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011988}
11989
11990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011991PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011993\n\
11994Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011995If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011996
11997static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011998unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011999{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012000 if (PyTuple_GET_SIZE(args) == 0)
12001 return do_strip(self, RIGHTSTRIP); /* Common case */
12002 else
12003 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012004}
12005
12006
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012008unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012010 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
Serhiy Storchaka05997252013-01-26 12:14:02 +020012013 if (len < 1)
12014 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
Victor Stinnerc4b49542011-12-11 22:44:26 +010012016 /* no repeat, return original string */
12017 if (len == 1)
12018 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012019
Benjamin Petersonbac79492012-01-14 13:34:47 -050012020 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 return NULL;
12022
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012023 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012024 PyErr_SetString(PyExc_OverflowError,
12025 "repeated string is too long");
12026 return NULL;
12027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012029
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012030 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031 if (!u)
12032 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012033 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (PyUnicode_GET_LENGTH(str) == 1) {
12036 const int kind = PyUnicode_KIND(str);
12037 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012038 if (kind == PyUnicode_1BYTE_KIND) {
12039 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012040 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012041 }
12042 else if (kind == PyUnicode_2BYTE_KIND) {
12043 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012044 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012045 ucs2[n] = fill_char;
12046 } else {
12047 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12048 assert(kind == PyUnicode_4BYTE_KIND);
12049 for (n = 0; n < len; ++n)
12050 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 }
12053 else {
12054 /* number of characters copied this far */
12055 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012056 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 char *to = (char *) PyUnicode_DATA(u);
12058 Py_MEMCPY(to, PyUnicode_DATA(str),
12059 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 n = (done <= nchars-done) ? done : nchars-done;
12062 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012063 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065 }
12066
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012067 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012068 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069}
12070
Alexander Belopolsky40018472011-02-26 01:02:56 +000012071PyObject *
12072PyUnicode_Replace(PyObject *obj,
12073 PyObject *subobj,
12074 PyObject *replobj,
12075 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076{
12077 PyObject *self;
12078 PyObject *str1;
12079 PyObject *str2;
12080 PyObject *result;
12081
12082 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012083 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012086 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 Py_DECREF(self);
12088 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 }
12090 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012091 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 Py_DECREF(self);
12093 Py_DECREF(str1);
12094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012096 if (PyUnicode_READY(self) == -1 ||
12097 PyUnicode_READY(str1) == -1 ||
12098 PyUnicode_READY(str2) == -1)
12099 result = NULL;
12100 else
12101 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102 Py_DECREF(self);
12103 Py_DECREF(str1);
12104 Py_DECREF(str2);
12105 return result;
12106}
12107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012108PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012109 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110\n\
12111Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012112old replaced by new. If the optional argument count is\n\
12113given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114
12115static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 PyObject *str1;
12119 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012120 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121 PyObject *result;
12122
Martin v. Löwis18e16552006-02-15 17:27:45 +000012123 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012125 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012126 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012128 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 return NULL;
12130 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012131 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012132 Py_DECREF(str1);
12133 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012134 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012135 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12136 result = NULL;
12137 else
12138 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139
12140 Py_DECREF(str1);
12141 Py_DECREF(str2);
12142 return result;
12143}
12144
Alexander Belopolsky40018472011-02-26 01:02:56 +000012145static PyObject *
12146unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012148 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 Py_ssize_t isize;
12150 Py_ssize_t osize, squote, dquote, i, o;
12151 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012152 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012156 return NULL;
12157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 isize = PyUnicode_GET_LENGTH(unicode);
12159 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 /* Compute length of output, quote characters, and
12162 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012163 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 max = 127;
12165 squote = dquote = 0;
12166 ikind = PyUnicode_KIND(unicode);
12167 for (i = 0; i < isize; i++) {
12168 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12169 switch (ch) {
12170 case '\'': squote++; osize++; break;
12171 case '"': dquote++; osize++; break;
12172 case '\\': case '\t': case '\r': case '\n':
12173 osize += 2; break;
12174 default:
12175 /* Fast-path ASCII */
12176 if (ch < ' ' || ch == 0x7f)
12177 osize += 4; /* \xHH */
12178 else if (ch < 0x7f)
12179 osize++;
12180 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12181 osize++;
12182 max = ch > max ? ch : max;
12183 }
12184 else if (ch < 0x100)
12185 osize += 4; /* \xHH */
12186 else if (ch < 0x10000)
12187 osize += 6; /* \uHHHH */
12188 else
12189 osize += 10; /* \uHHHHHHHH */
12190 }
12191 }
12192
12193 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012194 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012196 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (dquote)
12198 /* Both squote and dquote present. Use squote,
12199 and escape them */
12200 osize += squote;
12201 else
12202 quote = '"';
12203 }
Victor Stinner55c08782013-04-14 18:45:39 +020012204 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205
12206 repr = PyUnicode_New(osize, max);
12207 if (repr == NULL)
12208 return NULL;
12209 okind = PyUnicode_KIND(repr);
12210 odata = PyUnicode_DATA(repr);
12211
12212 PyUnicode_WRITE(okind, odata, 0, quote);
12213 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012214 if (unchanged) {
12215 _PyUnicode_FastCopyCharacters(repr, 1,
12216 unicode, 0,
12217 isize);
12218 }
12219 else {
12220 for (i = 0, o = 1; i < isize; i++) {
12221 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222
Victor Stinner55c08782013-04-14 18:45:39 +020012223 /* Escape quotes and backslashes */
12224 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012225 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012227 continue;
12228 }
12229
12230 /* Map special whitespace to '\t', \n', '\r' */
12231 if (ch == '\t') {
12232 PyUnicode_WRITE(okind, odata, o++, '\\');
12233 PyUnicode_WRITE(okind, odata, o++, 't');
12234 }
12235 else if (ch == '\n') {
12236 PyUnicode_WRITE(okind, odata, o++, '\\');
12237 PyUnicode_WRITE(okind, odata, o++, 'n');
12238 }
12239 else if (ch == '\r') {
12240 PyUnicode_WRITE(okind, odata, o++, '\\');
12241 PyUnicode_WRITE(okind, odata, o++, 'r');
12242 }
12243
12244 /* Map non-printable US ASCII to '\xhh' */
12245 else if (ch < ' ' || ch == 0x7F) {
12246 PyUnicode_WRITE(okind, odata, o++, '\\');
12247 PyUnicode_WRITE(okind, odata, o++, 'x');
12248 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12249 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12250 }
12251
12252 /* Copy ASCII characters as-is */
12253 else if (ch < 0x7F) {
12254 PyUnicode_WRITE(okind, odata, o++, ch);
12255 }
12256
12257 /* Non-ASCII characters */
12258 else {
12259 /* Map Unicode whitespace and control characters
12260 (categories Z* and C* except ASCII space)
12261 */
12262 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12263 PyUnicode_WRITE(okind, odata, o++, '\\');
12264 /* Map 8-bit characters to '\xhh' */
12265 if (ch <= 0xff) {
12266 PyUnicode_WRITE(okind, odata, o++, 'x');
12267 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12268 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12269 }
12270 /* Map 16-bit characters to '\uxxxx' */
12271 else if (ch <= 0xffff) {
12272 PyUnicode_WRITE(okind, odata, o++, 'u');
12273 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12274 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12275 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12276 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12277 }
12278 /* Map 21-bit characters to '\U00xxxxxx' */
12279 else {
12280 PyUnicode_WRITE(okind, odata, o++, 'U');
12281 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12282 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12283 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12284 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12285 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12286 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12287 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12288 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12289 }
12290 }
12291 /* Copy characters as-is */
12292 else {
12293 PyUnicode_WRITE(okind, odata, o++, ch);
12294 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012295 }
12296 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012299 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012300 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301}
12302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012303PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305\n\
12306Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012307such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308arguments start and end are interpreted as in slice notation.\n\
12309\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012310Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311
12312static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012315 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012316 Py_ssize_t start;
12317 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012318 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319
Jesus Ceaac451502011-04-20 17:09:23 +020012320 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12321 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323
Christian Heimesea71a522013-06-29 21:17:34 +020012324 if (PyUnicode_READY(self) == -1) {
12325 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012327 }
12328 if (PyUnicode_READY(substring) == -1) {
12329 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332
Victor Stinner7931d9a2011-11-04 00:22:48 +010012333 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334
12335 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 if (result == -2)
12338 return NULL;
12339
Christian Heimes217cfd12007-12-02 14:31:20 +000012340 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341}
12342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012343PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012346Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347
12348static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012351 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012352 Py_ssize_t start;
12353 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012354 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355
Jesus Ceaac451502011-04-20 17:09:23 +020012356 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12357 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359
Christian Heimesea71a522013-06-29 21:17:34 +020012360 if (PyUnicode_READY(self) == -1) {
12361 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012363 }
12364 if (PyUnicode_READY(substring) == -1) {
12365 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368
Victor Stinner7931d9a2011-11-04 00:22:48 +010012369 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370
12371 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 if (result == -2)
12374 return NULL;
12375
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376 if (result < 0) {
12377 PyErr_SetString(PyExc_ValueError, "substring not found");
12378 return NULL;
12379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380
Christian Heimes217cfd12007-12-02 14:31:20 +000012381 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382}
12383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012384PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012385 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012387Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012388done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389
12390static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012391unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012393 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 Py_UCS4 fillchar = ' ';
12395
Victor Stinnere9a29352011-10-01 02:14:59 +020012396 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012398
Benjamin Petersonbac79492012-01-14 13:34:47 -050012399 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400 return NULL;
12401
Victor Stinnerc4b49542011-12-11 22:44:26 +010012402 if (PyUnicode_GET_LENGTH(self) >= width)
12403 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404
Victor Stinnerc4b49542011-12-11 22:44:26 +010012405 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406}
12407
Alexander Belopolsky40018472011-02-26 01:02:56 +000012408PyObject *
12409PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410{
12411 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012412
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413 s = PyUnicode_FromObject(s);
12414 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 if (sep != NULL) {
12417 sep = PyUnicode_FromObject(sep);
12418 if (sep == NULL) {
12419 Py_DECREF(s);
12420 return NULL;
12421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 }
12423
Victor Stinner9310abb2011-10-05 00:59:23 +020012424 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425
12426 Py_DECREF(s);
12427 Py_XDECREF(sep);
12428 return result;
12429}
12430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012431PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012432 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012433\n\
12434Return a list of the words in S, using sep as the\n\
12435delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012436splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012437whitespace string is a separator and empty strings are\n\
12438removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012439
12440static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012441unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012443 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012445 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012447 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12448 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449 return NULL;
12450
12451 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012452 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012454 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012456 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457}
12458
Thomas Wouters477c8d52006-05-27 19:21:47 +000012459PyObject *
12460PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12461{
12462 PyObject* str_obj;
12463 PyObject* sep_obj;
12464 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 int kind1, kind2, kind;
12466 void *buf1 = NULL, *buf2 = NULL;
12467 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012468
12469 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012470 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012472 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012473 if (!sep_obj) {
12474 Py_DECREF(str_obj);
12475 return NULL;
12476 }
12477 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12478 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012479 Py_DECREF(str_obj);
12480 return NULL;
12481 }
12482
Victor Stinner14f8f022011-10-05 20:58:25 +020012483 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012485 kind = Py_MAX(kind1, kind2);
12486 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012488 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 if (!buf1)
12490 goto onError;
12491 buf2 = PyUnicode_DATA(sep_obj);
12492 if (kind2 != kind)
12493 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12494 if (!buf2)
12495 goto onError;
12496 len1 = PyUnicode_GET_LENGTH(str_obj);
12497 len2 = PyUnicode_GET_LENGTH(sep_obj);
12498
Benjamin Petersonead6b532011-12-20 17:23:42 -060012499 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012501 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12502 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12503 else
12504 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 break;
12506 case PyUnicode_2BYTE_KIND:
12507 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12508 break;
12509 case PyUnicode_4BYTE_KIND:
12510 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12511 break;
12512 default:
12513 assert(0);
12514 out = 0;
12515 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012516
12517 Py_DECREF(sep_obj);
12518 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 if (kind1 != kind)
12520 PyMem_Free(buf1);
12521 if (kind2 != kind)
12522 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012523
12524 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 onError:
12526 Py_DECREF(sep_obj);
12527 Py_DECREF(str_obj);
12528 if (kind1 != kind && buf1)
12529 PyMem_Free(buf1);
12530 if (kind2 != kind && buf2)
12531 PyMem_Free(buf2);
12532 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012533}
12534
12535
12536PyObject *
12537PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12538{
12539 PyObject* str_obj;
12540 PyObject* sep_obj;
12541 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 int kind1, kind2, kind;
12543 void *buf1 = NULL, *buf2 = NULL;
12544 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012545
12546 str_obj = PyUnicode_FromObject(str_in);
12547 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012549 sep_obj = PyUnicode_FromObject(sep_in);
12550 if (!sep_obj) {
12551 Py_DECREF(str_obj);
12552 return NULL;
12553 }
12554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 kind1 = PyUnicode_KIND(str_in);
12556 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012557 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 buf1 = PyUnicode_DATA(str_in);
12559 if (kind1 != kind)
12560 buf1 = _PyUnicode_AsKind(str_in, kind);
12561 if (!buf1)
12562 goto onError;
12563 buf2 = PyUnicode_DATA(sep_obj);
12564 if (kind2 != kind)
12565 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12566 if (!buf2)
12567 goto onError;
12568 len1 = PyUnicode_GET_LENGTH(str_obj);
12569 len2 = PyUnicode_GET_LENGTH(sep_obj);
12570
Benjamin Petersonead6b532011-12-20 17:23:42 -060012571 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012573 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12574 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12575 else
12576 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 break;
12578 case PyUnicode_2BYTE_KIND:
12579 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12580 break;
12581 case PyUnicode_4BYTE_KIND:
12582 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12583 break;
12584 default:
12585 assert(0);
12586 out = 0;
12587 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012588
12589 Py_DECREF(sep_obj);
12590 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 if (kind1 != kind)
12592 PyMem_Free(buf1);
12593 if (kind2 != kind)
12594 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012595
12596 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 onError:
12598 Py_DECREF(sep_obj);
12599 Py_DECREF(str_obj);
12600 if (kind1 != kind && buf1)
12601 PyMem_Free(buf1);
12602 if (kind2 != kind && buf2)
12603 PyMem_Free(buf2);
12604 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012605}
12606
12607PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012609\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012610Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012611the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012612found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012613
12614static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012615unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012616{
Victor Stinner9310abb2011-10-05 00:59:23 +020012617 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012618}
12619
12620PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012621 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012622\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012623Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012624the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012625separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012626
12627static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012628unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012629{
Victor Stinner9310abb2011-10-05 00:59:23 +020012630 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012631}
12632
Alexander Belopolsky40018472011-02-26 01:02:56 +000012633PyObject *
12634PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012635{
12636 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012637
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012638 s = PyUnicode_FromObject(s);
12639 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012640 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 if (sep != NULL) {
12642 sep = PyUnicode_FromObject(sep);
12643 if (sep == NULL) {
12644 Py_DECREF(s);
12645 return NULL;
12646 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012647 }
12648
Victor Stinner9310abb2011-10-05 00:59:23 +020012649 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012650
12651 Py_DECREF(s);
12652 Py_XDECREF(sep);
12653 return result;
12654}
12655
12656PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012657 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012658\n\
12659Return a list of the words in S, using sep as the\n\
12660delimiter string, starting at the end of the string and\n\
12661working to the front. If maxsplit is given, at most maxsplit\n\
12662splits are done. If sep is not specified, any whitespace string\n\
12663is a separator.");
12664
12665static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012666unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012667{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012668 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012669 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012670 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012671
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012672 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12673 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012674 return NULL;
12675
12676 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012677 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012678 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012679 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012680 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012681 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012682}
12683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012684PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686\n\
12687Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012688Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012689is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690
12691static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012692unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012694 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012695 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012697 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12698 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699 return NULL;
12700
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012701 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702}
12703
12704static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012705PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012707 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708}
12709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012710PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012711 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712\n\
12713Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012714and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715
12716static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012717unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012719 if (PyUnicode_READY(self) == -1)
12720 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012721 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722}
12723
Larry Hastings31826802013-10-19 00:09:25 -070012724/*[clinic]
Larry Hastingsed4a1c52013-11-18 09:32:13 -080012725class str
Georg Brandlceee0772007-11-27 23:48:05 +000012726
Larry Hastings31826802013-10-19 00:09:25 -070012727@staticmethod
12728str.maketrans as unicode_maketrans
12729
12730 x: object
12731
12732 y: unicode=NULL
12733
12734 z: unicode=NULL
12735
12736 /
12737
12738Return a translation table usable for str.translate().
12739
12740If there is only one argument, it must be a dictionary mapping Unicode
12741ordinals (integers) or characters to Unicode ordinals, strings or None.
12742Character keys will be then converted to ordinals.
12743If there are two arguments, they must be strings of equal length, and
12744in the resulting dictionary, each character in x will be mapped to the
12745character at the same position in y. If there is a third argument, it
12746must be a string, whose characters will be mapped to None in the result.
12747[clinic]*/
12748
12749PyDoc_STRVAR(unicode_maketrans__doc__,
12750"Return a translation table usable for str.translate().\n"
12751"\n"
12752"str.maketrans(x, y=None, z=None)\n"
12753"\n"
12754"If there is only one argument, it must be a dictionary mapping Unicode\n"
12755"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12756"Character keys will be then converted to ordinals.\n"
12757"If there are two arguments, they must be strings of equal length, and\n"
12758"in the resulting dictionary, each character in x will be mapped to the\n"
12759"character at the same position in y. If there is a third argument, it\n"
12760"must be a string, whose characters will be mapped to None in the result.");
12761
12762#define UNICODE_MAKETRANS_METHODDEF \
12763 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12764
12765static PyObject *
12766unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
12767
12768static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012769unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012770{
Larry Hastings31826802013-10-19 00:09:25 -070012771 PyObject *return_value = NULL;
12772 PyObject *x;
12773 PyObject *y = NULL;
12774 PyObject *z = NULL;
12775
12776 if (!PyArg_ParseTuple(args,
12777 "O|UU:maketrans",
12778 &x, &y, &z))
12779 goto exit;
12780 return_value = unicode_maketrans_impl(x, y, z);
12781
12782exit:
12783 return return_value;
12784}
12785
12786static PyObject *
12787unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12788/*[clinic checksum: 137db9c3199e7906b7967009f511c24fa3235b5f]*/
12789{
Georg Brandlceee0772007-11-27 23:48:05 +000012790 PyObject *new = NULL, *key, *value;
12791 Py_ssize_t i = 0;
12792 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012793
Georg Brandlceee0772007-11-27 23:48:05 +000012794 new = PyDict_New();
12795 if (!new)
12796 return NULL;
12797 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 int x_kind, y_kind, z_kind;
12799 void *x_data, *y_data, *z_data;
12800
Georg Brandlceee0772007-11-27 23:48:05 +000012801 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012802 if (!PyUnicode_Check(x)) {
12803 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12804 "be a string if there is a second argument");
12805 goto err;
12806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012808 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12809 "arguments must have equal length");
12810 goto err;
12811 }
12812 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 x_kind = PyUnicode_KIND(x);
12814 y_kind = PyUnicode_KIND(y);
12815 x_data = PyUnicode_DATA(x);
12816 y_data = PyUnicode_DATA(y);
12817 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12818 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012819 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012820 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012821 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012822 if (!value) {
12823 Py_DECREF(key);
12824 goto err;
12825 }
Georg Brandlceee0772007-11-27 23:48:05 +000012826 res = PyDict_SetItem(new, key, value);
12827 Py_DECREF(key);
12828 Py_DECREF(value);
12829 if (res < 0)
12830 goto err;
12831 }
12832 /* create entries for deleting chars in z */
12833 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 z_kind = PyUnicode_KIND(z);
12835 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012836 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012838 if (!key)
12839 goto err;
12840 res = PyDict_SetItem(new, key, Py_None);
12841 Py_DECREF(key);
12842 if (res < 0)
12843 goto err;
12844 }
12845 }
12846 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 int kind;
12848 void *data;
12849
Georg Brandlceee0772007-11-27 23:48:05 +000012850 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012851 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012852 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12853 "to maketrans it must be a dict");
12854 goto err;
12855 }
12856 /* copy entries into the new dict, converting string keys to int keys */
12857 while (PyDict_Next(x, &i, &key, &value)) {
12858 if (PyUnicode_Check(key)) {
12859 /* convert string keys to integer keys */
12860 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012861 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012862 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12863 "table must be of length 1");
12864 goto err;
12865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 kind = PyUnicode_KIND(key);
12867 data = PyUnicode_DATA(key);
12868 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012869 if (!newkey)
12870 goto err;
12871 res = PyDict_SetItem(new, newkey, value);
12872 Py_DECREF(newkey);
12873 if (res < 0)
12874 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012875 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012876 /* just keep integer keys */
12877 if (PyDict_SetItem(new, key, value) < 0)
12878 goto err;
12879 } else {
12880 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12881 "be strings or integers");
12882 goto err;
12883 }
12884 }
12885 }
12886 return new;
12887 err:
12888 Py_DECREF(new);
12889 return NULL;
12890}
12891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012892PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012893 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894\n\
12895Return a copy of the string S, where all characters have been mapped\n\
12896through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012897Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012898Unmapped characters are left untouched. Characters mapped to None\n\
12899are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900
12901static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012902unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905}
12906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012907PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012909\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012910Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911
12912static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012913unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012914{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012915 if (PyUnicode_READY(self) == -1)
12916 return NULL;
12917 if (PyUnicode_IS_ASCII(self))
12918 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012919 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012920}
12921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012922PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012925Pad a numeric string S with zeros on the left, to fill a field\n\
12926of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927
12928static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012929unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012931 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012932 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012933 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 int kind;
12935 void *data;
12936 Py_UCS4 chr;
12937
Martin v. Löwis18e16552006-02-15 17:27:45 +000012938 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939 return NULL;
12940
Benjamin Petersonbac79492012-01-14 13:34:47 -050012941 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943
Victor Stinnerc4b49542011-12-11 22:44:26 +010012944 if (PyUnicode_GET_LENGTH(self) >= width)
12945 return unicode_result_unchanged(self);
12946
12947 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948
12949 u = pad(self, fill, 0, '0');
12950
Walter Dörwald068325e2002-04-15 13:36:47 +000012951 if (u == NULL)
12952 return NULL;
12953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 kind = PyUnicode_KIND(u);
12955 data = PyUnicode_DATA(u);
12956 chr = PyUnicode_READ(kind, data, fill);
12957
12958 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 PyUnicode_WRITE(kind, data, 0, chr);
12961 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962 }
12963
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012964 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012965 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012966}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967
/* Compiled out by the "#if 0" guard.  When enabled, this wrapper simply
   forwards self to PyUnicode_TransformDecimalAndSpaceToASCII(). */
#if 0
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012976PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012977 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012978\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012979Return True if S starts with the specified prefix, False otherwise.\n\
12980With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012981With optional end, stop comparing S at that position.\n\
12982prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983
12984static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012985unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012988 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012989 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012990 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012991 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012992 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993
Jesus Ceaac451502011-04-20 17:09:23 +020012994 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012995 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012996 if (PyTuple_Check(subobj)) {
12997 Py_ssize_t i;
12998 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012999 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013000 if (substring == NULL)
13001 return NULL;
13002 result = tailmatch(self, substring, start, end, -1);
13003 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013004 if (result == -1)
13005 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013006 if (result) {
13007 Py_RETURN_TRUE;
13008 }
13009 }
13010 /* nothing matched */
13011 Py_RETURN_FALSE;
13012 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013013 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013014 if (substring == NULL) {
13015 if (PyErr_ExceptionMatches(PyExc_TypeError))
13016 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13017 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013018 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013019 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013020 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013021 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013022 if (result == -1)
13023 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013024 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013025}
13026
13027
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013028PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013029 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013030\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013031Return True if S ends with the specified suffix, False otherwise.\n\
13032With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013033With optional end, stop comparing S at that position.\n\
13034suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013035
13036static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013037unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013038 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013039{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013040 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013041 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013042 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013043 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013044 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045
Jesus Ceaac451502011-04-20 17:09:23 +020013046 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013047 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013048 if (PyTuple_Check(subobj)) {
13049 Py_ssize_t i;
13050 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013051 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013052 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013053 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013054 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013055 result = tailmatch(self, substring, start, end, +1);
13056 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013057 if (result == -1)
13058 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013059 if (result) {
13060 Py_RETURN_TRUE;
13061 }
13062 }
13063 Py_RETURN_FALSE;
13064 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013065 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013066 if (substring == NULL) {
13067 if (PyErr_ExceptionMatches(PyExc_TypeError))
13068 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13069 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013071 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013072 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013073 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013074 if (result == -1)
13075 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013076 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077}
13078
Victor Stinner202fdca2012-05-07 12:47:02 +020013079Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013080_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013081{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013082 if (!writer->readonly)
13083 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13084 else {
13085 /* Copy-on-write mode: set buffer size to 0 so
13086 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13087 * next write. */
13088 writer->size = 0;
13089 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013090 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13091 writer->data = PyUnicode_DATA(writer->buffer);
13092 writer->kind = PyUnicode_KIND(writer->buffer);
13093}
13094
Victor Stinnerd3f08822012-05-29 12:57:52 +020013095void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013096_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013097{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013098 memset(writer, 0, sizeof(*writer));
13099#ifdef Py_DEBUG
13100 writer->kind = 5; /* invalid kind */
13101#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013102 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013103}
13104
Victor Stinnerd3f08822012-05-29 12:57:52 +020013105int
13106_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13107 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013108{
13109 Py_ssize_t newlen;
13110 PyObject *newbuffer;
13111
Victor Stinnerd3f08822012-05-29 12:57:52 +020013112 assert(length > 0);
13113
Victor Stinner202fdca2012-05-07 12:47:02 +020013114 if (length > PY_SSIZE_T_MAX - writer->pos) {
13115 PyErr_NoMemory();
13116 return -1;
13117 }
13118 newlen = writer->pos + length;
13119
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013120 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013121
Victor Stinnerd3f08822012-05-29 12:57:52 +020013122 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013123 assert(!writer->readonly);
13124 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013125 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013126 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013127 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013128 if (newlen < writer->min_length)
13129 newlen = writer->min_length;
13130
Victor Stinnerd3f08822012-05-29 12:57:52 +020013131 writer->buffer = PyUnicode_New(newlen, maxchar);
13132 if (writer->buffer == NULL)
13133 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013134 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013135 else if (newlen > writer->size) {
13136 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013137 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013138 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013139 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013140 if (newlen < writer->min_length)
13141 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013142
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013143 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013144 /* resize + widen */
13145 newbuffer = PyUnicode_New(newlen, maxchar);
13146 if (newbuffer == NULL)
13147 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013148 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13149 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013150 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013151 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013152 }
13153 else {
13154 newbuffer = resize_compact(writer->buffer, newlen);
13155 if (newbuffer == NULL)
13156 return -1;
13157 }
13158 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013159 }
13160 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013161 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013162 newbuffer = PyUnicode_New(writer->size, maxchar);
13163 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013164 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013165 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13166 writer->buffer, 0, writer->pos);
13167 Py_DECREF(writer->buffer);
13168 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013169 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013170 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013171 return 0;
13172}
13173
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013174Py_LOCAL_INLINE(int)
13175_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013176{
13177 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13178 return -1;
13179 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13180 writer->pos++;
13181 return 0;
13182}
13183
13184int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013185_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13186{
13187 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13188}
13189
13190int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013191_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13192{
13193 Py_UCS4 maxchar;
13194 Py_ssize_t len;
13195
13196 if (PyUnicode_READY(str) == -1)
13197 return -1;
13198 len = PyUnicode_GET_LENGTH(str);
13199 if (len == 0)
13200 return 0;
13201 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13202 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013203 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013204 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013205 Py_INCREF(str);
13206 writer->buffer = str;
13207 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013208 writer->pos += len;
13209 return 0;
13210 }
13211 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13212 return -1;
13213 }
13214 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13215 str, 0, len);
13216 writer->pos += len;
13217 return 0;
13218}
13219
Victor Stinnere215d962012-10-06 23:03:36 +020013220int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013221_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13222 Py_ssize_t start, Py_ssize_t end)
13223{
13224 Py_UCS4 maxchar;
13225 Py_ssize_t len;
13226
13227 if (PyUnicode_READY(str) == -1)
13228 return -1;
13229
13230 assert(0 <= start);
13231 assert(end <= PyUnicode_GET_LENGTH(str));
13232 assert(start <= end);
13233
13234 if (end == 0)
13235 return 0;
13236
13237 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13238 return _PyUnicodeWriter_WriteStr(writer, str);
13239
13240 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13241 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13242 else
13243 maxchar = writer->maxchar;
13244 len = end - start;
13245
13246 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13247 return -1;
13248
13249 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13250 str, start, len);
13251 writer->pos += len;
13252 return 0;
13253}
13254
13255int
Victor Stinnere215d962012-10-06 23:03:36 +020013256_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13257{
13258 Py_UCS4 maxchar;
13259
13260 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13261 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13262 return -1;
13263 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13264 writer->pos += len;
13265 return 0;
13266}
13267
Victor Stinnerd3f08822012-05-29 12:57:52 +020013268PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013269_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013270{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013271 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013272 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013273 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013274 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013275 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013276 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013277 str = writer->buffer;
13278 writer->buffer = NULL;
13279 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13280 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013281 }
13282 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13283 PyObject *newbuffer;
13284 newbuffer = resize_compact(writer->buffer, writer->pos);
13285 if (newbuffer == NULL) {
13286 Py_DECREF(writer->buffer);
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013287 writer->buffer = NULL;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013288 return NULL;
13289 }
13290 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013291 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013292 str = writer->buffer;
13293 writer->buffer = NULL;
13294 assert(_PyUnicode_CheckConsistency(str, 1));
13295 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013296}
13297
Victor Stinnerd3f08822012-05-29 12:57:52 +020013298void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013299_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013300{
13301 Py_CLEAR(writer->buffer);
13302}
13303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013305
13306PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013307 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013308\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013309Return a formatted version of S, using substitutions from args and kwargs.\n\
13310The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013311
Eric Smith27bbca62010-11-04 17:06:58 +000013312PyDoc_STRVAR(format_map__doc__,
13313 "S.format_map(mapping) -> str\n\
13314\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013315Return a formatted version of S, using substitutions from mapping.\n\
13316The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013317
Eric Smith4a7d76d2008-05-30 18:10:19 +000013318static PyObject *
13319unicode__format__(PyObject* self, PyObject* args)
13320{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013321 PyObject *format_spec;
13322 _PyUnicodeWriter writer;
13323 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013324
13325 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13326 return NULL;
13327
Victor Stinnerd3f08822012-05-29 12:57:52 +020013328 if (PyUnicode_READY(self) == -1)
13329 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013330 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013331 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13332 self, format_spec, 0,
13333 PyUnicode_GET_LENGTH(format_spec));
13334 if (ret == -1) {
13335 _PyUnicodeWriter_Dealloc(&writer);
13336 return NULL;
13337 }
13338 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013339}
13340
Eric Smith8c663262007-08-25 02:26:07 +000013341PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013342 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013343\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013344Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013345
13346static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013347unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013348{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013349 Py_ssize_t size;
13350
13351 /* If it's a compact object, account for base structure +
13352 character data. */
13353 if (PyUnicode_IS_COMPACT_ASCII(v))
13354 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13355 else if (PyUnicode_IS_COMPACT(v))
13356 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013357 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358 else {
13359 /* If it is a two-block object, account for base object, and
13360 for character block if present. */
13361 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013362 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013363 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013364 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365 }
13366 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013367 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013368 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013369 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013370 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013371 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013372
13373 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013374}
13375
13376PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013378
13379static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013380unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013381{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013382 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013383 if (!copy)
13384 return NULL;
13385 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013386}
13387
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013389 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013390 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013391 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13392 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013393 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13394 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013395 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013396 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13397 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13398 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013399 {"expandtabs", (PyCFunction) unicode_expandtabs,
13400 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013401 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013402 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013403 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13404 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13405 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013406 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013407 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13408 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13409 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013410 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013411 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013412 {"splitlines", (PyCFunction) unicode_splitlines,
13413 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013414 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013415 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13416 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13417 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13418 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13419 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13420 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13421 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13422 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13423 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13424 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13425 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13426 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13427 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13428 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013429 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013430 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013431 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013432 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013433 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013434 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013435 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013436 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013437#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013438 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013439 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013440#endif
13441
Benjamin Peterson14339b62009-01-31 16:36:08 +000013442 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443 {NULL, NULL}
13444};
13445
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013446static PyObject *
13447unicode_mod(PyObject *v, PyObject *w)
13448{
Brian Curtindfc80e32011-08-10 20:28:54 -050013449 if (!PyUnicode_Check(v))
13450 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013452}
13453
13454static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013455 0, /*nb_add*/
13456 0, /*nb_subtract*/
13457 0, /*nb_multiply*/
13458 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013459};
13460
Guido van Rossumd57fd912000-03-10 22:53:23 +000013461static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013462 (lenfunc) unicode_length, /* sq_length */
13463 PyUnicode_Concat, /* sq_concat */
13464 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13465 (ssizeargfunc) unicode_getitem, /* sq_item */
13466 0, /* sq_slice */
13467 0, /* sq_ass_item */
13468 0, /* sq_ass_slice */
13469 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013470};
13471
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013472static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013473unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013475 if (PyUnicode_READY(self) == -1)
13476 return NULL;
13477
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013478 if (PyIndex_Check(item)) {
13479 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013480 if (i == -1 && PyErr_Occurred())
13481 return NULL;
13482 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013483 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013484 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013485 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013486 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013487 PyObject *result;
13488 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013489 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013490 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013492 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013493 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013494 return NULL;
13495 }
13496
13497 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013498 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013499 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013500 slicelength == PyUnicode_GET_LENGTH(self)) {
13501 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013502 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013503 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013504 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013505 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013506 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013507 src_kind = PyUnicode_KIND(self);
13508 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013509 if (!PyUnicode_IS_ASCII(self)) {
13510 kind_limit = kind_maxchar_limit(src_kind);
13511 max_char = 0;
13512 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13513 ch = PyUnicode_READ(src_kind, src_data, cur);
13514 if (ch > max_char) {
13515 max_char = ch;
13516 if (max_char >= kind_limit)
13517 break;
13518 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013519 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013520 }
Victor Stinner55c99112011-10-13 01:17:06 +020013521 else
13522 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013523 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013524 if (result == NULL)
13525 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013526 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013527 dest_data = PyUnicode_DATA(result);
13528
13529 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013530 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13531 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013532 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013533 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013534 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013535 } else {
13536 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13537 return NULL;
13538 }
13539}
13540
13541static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013542 (lenfunc)unicode_length, /* mp_length */
13543 (binaryfunc)unicode_subscript, /* mp_subscript */
13544 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013545};
13546
Guido van Rossumd57fd912000-03-10 22:53:23 +000013547
Guido van Rossumd57fd912000-03-10 22:53:23 +000013548/* Helpers for PyUnicode_Format() */
13549
Victor Stinnera47082312012-10-04 02:19:54 +020013550struct unicode_formatter_t {
13551 PyObject *args;
13552 int args_owned;
13553 Py_ssize_t arglen, argidx;
13554 PyObject *dict;
13555
13556 enum PyUnicode_Kind fmtkind;
13557 Py_ssize_t fmtcnt, fmtpos;
13558 void *fmtdata;
13559 PyObject *fmtstr;
13560
13561 _PyUnicodeWriter writer;
13562};
13563
13564struct unicode_format_arg_t {
13565 Py_UCS4 ch;
13566 int flags;
13567 Py_ssize_t width;
13568 int prec;
13569 int sign;
13570};
13571
Guido van Rossumd57fd912000-03-10 22:53:23 +000013572static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013573unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013574{
Victor Stinnera47082312012-10-04 02:19:54 +020013575 Py_ssize_t argidx = ctx->argidx;
13576
13577 if (argidx < ctx->arglen) {
13578 ctx->argidx++;
13579 if (ctx->arglen < 0)
13580 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 else
Victor Stinnera47082312012-10-04 02:19:54 +020013582 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013583 }
13584 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013586 return NULL;
13587}
13588
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013589/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013590
Victor Stinnera47082312012-10-04 02:19:54 +020013591/* Format a float into the writer if the writer is not NULL, or into *p_output
13592 otherwise.
13593
13594 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013595static int
Victor Stinnera47082312012-10-04 02:19:54 +020013596formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13597 PyObject **p_output,
13598 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013599{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013600 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013601 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013602 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013603 int prec;
13604 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013605
Guido van Rossumd57fd912000-03-10 22:53:23 +000013606 x = PyFloat_AsDouble(v);
13607 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013608 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013609
Victor Stinnera47082312012-10-04 02:19:54 +020013610 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013611 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013612 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013613
Victor Stinnera47082312012-10-04 02:19:54 +020013614 if (arg->flags & F_ALT)
13615 dtoa_flags = Py_DTSF_ALT;
13616 else
13617 dtoa_flags = 0;
13618 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013619 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013620 return -1;
13621 len = strlen(p);
13622 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013623 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13624 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013625 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013626 }
Victor Stinner184252a2012-06-16 02:57:41 +020013627 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013628 writer->pos += len;
13629 }
13630 else
13631 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013632 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013633 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013634}
13635
Victor Stinnerd0880d52012-04-27 23:40:13 +020013636/* formatlong() emulates the format codes d, u, o, x and X, and
13637 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13638 * Python's regular ints.
13639 * Return value: a new PyUnicodeObject*, or NULL if error.
13640 * The output string is of the form
13641 * "-"? ("0x" | "0X")? digit+
13642 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13643 * set in flags. The case of hex digits will be correct,
13644 * There will be at least prec digits, zero-filled on the left if
13645 * necessary to get that many.
13646 * val object to be converted
13647 * flags bitmask of format flags; only F_ALT is looked at
13648 * prec minimum number of digits; 0-fill on left if needed
13649 * type a character in [duoxX]; u acts the same as d
13650 *
13651 * CAUTION: o, x and X conversions on regular ints can never
13652 * produce a '-' sign, but can for Python's unbounded ints.
13653 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013654static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013655formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013656{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013657 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013658 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013659 Py_ssize_t i;
13660 int sign; /* 1 if '-', else 0 */
13661 int len; /* number of characters */
13662 Py_ssize_t llen;
13663 int numdigits; /* len == numnondigits + numdigits */
13664 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013665 int prec = arg->prec;
13666 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013667
Victor Stinnerd0880d52012-04-27 23:40:13 +020013668 /* Avoid exceeding SSIZE_T_MAX */
13669 if (prec > INT_MAX-3) {
13670 PyErr_SetString(PyExc_OverflowError,
13671 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013672 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013673 }
13674
13675 assert(PyLong_Check(val));
13676
13677 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013678 default:
13679 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013680 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013681 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013682 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013683 /* int and int subclasses should print numerically when a numeric */
13684 /* format code is used (see issue18780) */
13685 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013686 break;
13687 case 'o':
13688 numnondigits = 2;
13689 result = PyNumber_ToBase(val, 8);
13690 break;
13691 case 'x':
13692 case 'X':
13693 numnondigits = 2;
13694 result = PyNumber_ToBase(val, 16);
13695 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013696 }
13697 if (!result)
13698 return NULL;
13699
13700 assert(unicode_modifiable(result));
13701 assert(PyUnicode_IS_READY(result));
13702 assert(PyUnicode_IS_ASCII(result));
13703
13704 /* To modify the string in-place, there can only be one reference. */
13705 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013706 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013707 PyErr_BadInternalCall();
13708 return NULL;
13709 }
13710 buf = PyUnicode_DATA(result);
13711 llen = PyUnicode_GET_LENGTH(result);
13712 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013713 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013714 PyErr_SetString(PyExc_ValueError,
13715 "string too large in _PyBytes_FormatLong");
13716 return NULL;
13717 }
13718 len = (int)llen;
13719 sign = buf[0] == '-';
13720 numnondigits += sign;
13721 numdigits = len - numnondigits;
13722 assert(numdigits > 0);
13723
13724 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013725 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013726 (type == 'o' || type == 'x' || type == 'X'))) {
13727 assert(buf[sign] == '0');
13728 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13729 buf[sign+1] == 'o');
13730 numnondigits -= 2;
13731 buf += 2;
13732 len -= 2;
13733 if (sign)
13734 buf[0] = '-';
13735 assert(len == numnondigits + numdigits);
13736 assert(numdigits > 0);
13737 }
13738
13739 /* Fill with leading zeroes to meet minimum width. */
13740 if (prec > numdigits) {
13741 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13742 numnondigits + prec);
13743 char *b1;
13744 if (!r1) {
13745 Py_DECREF(result);
13746 return NULL;
13747 }
13748 b1 = PyBytes_AS_STRING(r1);
13749 for (i = 0; i < numnondigits; ++i)
13750 *b1++ = *buf++;
13751 for (i = 0; i < prec - numdigits; i++)
13752 *b1++ = '0';
13753 for (i = 0; i < numdigits; i++)
13754 *b1++ = *buf++;
13755 *b1 = '\0';
13756 Py_DECREF(result);
13757 result = r1;
13758 buf = PyBytes_AS_STRING(result);
13759 len = numnondigits + prec;
13760 }
13761
13762 /* Fix up case for hex conversions. */
13763 if (type == 'X') {
13764 /* Need to convert all lower case letters to upper case.
13765 and need to convert 0x to 0X (and -0x to -0X). */
13766 for (i = 0; i < len; i++)
13767 if (buf[i] >= 'a' && buf[i] <= 'x')
13768 buf[i] -= 'a'-'A';
13769 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013770 if (!PyUnicode_Check(result)
13771 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013772 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013773 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013774 Py_DECREF(result);
13775 result = unicode;
13776 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013777 else if (len != PyUnicode_GET_LENGTH(result)) {
13778 if (PyUnicode_Resize(&result, len) < 0)
13779 Py_CLEAR(result);
13780 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013781 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013782}
13783
Victor Stinner621ef3d2012-10-02 00:33:47 +020013784/* Format an integer.
13785 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013786 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013787 * -1 and raise an exception on error */
13788static int
Victor Stinnera47082312012-10-04 02:19:54 +020013789mainformatlong(PyObject *v,
13790 struct unicode_format_arg_t *arg,
13791 PyObject **p_output,
13792 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013793{
13794 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013795 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013796
13797 if (!PyNumber_Check(v))
13798 goto wrongtype;
13799
13800 if (!PyLong_Check(v)) {
13801 iobj = PyNumber_Long(v);
13802 if (iobj == NULL) {
13803 if (PyErr_ExceptionMatches(PyExc_TypeError))
13804 goto wrongtype;
13805 return -1;
13806 }
13807 assert(PyLong_Check(iobj));
13808 }
13809 else {
13810 iobj = v;
13811 Py_INCREF(iobj);
13812 }
13813
13814 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013815 && arg->width == -1 && arg->prec == -1
13816 && !(arg->flags & (F_SIGN | F_BLANK))
13817 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013818 {
13819 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013820 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013821 int base;
13822
Victor Stinnera47082312012-10-04 02:19:54 +020013823 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013824 {
13825 default:
13826 assert(0 && "'type' not in [diuoxX]");
13827 case 'd':
13828 case 'i':
13829 case 'u':
13830 base = 10;
13831 break;
13832 case 'o':
13833 base = 8;
13834 break;
13835 case 'x':
13836 case 'X':
13837 base = 16;
13838 break;
13839 }
13840
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013841 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13842 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013843 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013844 }
13845 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013846 return 1;
13847 }
13848
Victor Stinnera47082312012-10-04 02:19:54 +020013849 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013850 Py_DECREF(iobj);
13851 if (res == NULL)
13852 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013853 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013854 return 0;
13855
13856wrongtype:
13857 PyErr_Format(PyExc_TypeError,
13858 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013859 "not %.200s",
13860 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013861 return -1;
13862}
13863
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013864static Py_UCS4
13865formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013866{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013867 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013868 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013869 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013870 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013872 goto onError;
13873 }
13874 else {
13875 /* Integer input truncated to a character */
13876 long x;
13877 x = PyLong_AsLong(v);
13878 if (x == -1 && PyErr_Occurred())
13879 goto onError;
13880
Victor Stinner8faf8212011-12-08 22:14:11 +010013881 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013882 PyErr_SetString(PyExc_OverflowError,
13883 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013884 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013885 }
13886
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013887 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013888 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013889
Benjamin Peterson29060642009-01-31 22:14:21 +000013890 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013891 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013892 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013893 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013894}
13895
Victor Stinnera47082312012-10-04 02:19:54 +020013896/* Parse options of an argument: flags, width, precision.
13897 Handle also "%(name)" syntax.
13898
13899 Return 0 if the argument has been formatted into arg->str.
13900 Return 1 if the argument has been written into ctx->writer,
13901 Raise an exception and return -1 on error. */
13902static int
13903unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13904 struct unicode_format_arg_t *arg)
13905{
13906#define FORMAT_READ(ctx) \
13907 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13908
13909 PyObject *v;
13910
Victor Stinnera47082312012-10-04 02:19:54 +020013911 if (arg->ch == '(') {
13912 /* Get argument value from a dictionary. Example: "%(name)s". */
13913 Py_ssize_t keystart;
13914 Py_ssize_t keylen;
13915 PyObject *key;
13916 int pcount = 1;
13917
13918 if (ctx->dict == NULL) {
13919 PyErr_SetString(PyExc_TypeError,
13920 "format requires a mapping");
13921 return -1;
13922 }
13923 ++ctx->fmtpos;
13924 --ctx->fmtcnt;
13925 keystart = ctx->fmtpos;
13926 /* Skip over balanced parentheses */
13927 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13928 arg->ch = FORMAT_READ(ctx);
13929 if (arg->ch == ')')
13930 --pcount;
13931 else if (arg->ch == '(')
13932 ++pcount;
13933 ctx->fmtpos++;
13934 }
13935 keylen = ctx->fmtpos - keystart - 1;
13936 if (ctx->fmtcnt < 0 || pcount > 0) {
13937 PyErr_SetString(PyExc_ValueError,
13938 "incomplete format key");
13939 return -1;
13940 }
13941 key = PyUnicode_Substring(ctx->fmtstr,
13942 keystart, keystart + keylen);
13943 if (key == NULL)
13944 return -1;
13945 if (ctx->args_owned) {
13946 Py_DECREF(ctx->args);
13947 ctx->args_owned = 0;
13948 }
13949 ctx->args = PyObject_GetItem(ctx->dict, key);
13950 Py_DECREF(key);
13951 if (ctx->args == NULL)
13952 return -1;
13953 ctx->args_owned = 1;
13954 ctx->arglen = -1;
13955 ctx->argidx = -2;
13956 }
13957
13958 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013959 while (--ctx->fmtcnt >= 0) {
13960 arg->ch = FORMAT_READ(ctx);
13961 ctx->fmtpos++;
13962 switch (arg->ch) {
13963 case '-': arg->flags |= F_LJUST; continue;
13964 case '+': arg->flags |= F_SIGN; continue;
13965 case ' ': arg->flags |= F_BLANK; continue;
13966 case '#': arg->flags |= F_ALT; continue;
13967 case '0': arg->flags |= F_ZERO; continue;
13968 }
13969 break;
13970 }
13971
13972 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013973 if (arg->ch == '*') {
13974 v = unicode_format_getnextarg(ctx);
13975 if (v == NULL)
13976 return -1;
13977 if (!PyLong_Check(v)) {
13978 PyErr_SetString(PyExc_TypeError,
13979 "* wants int");
13980 return -1;
13981 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013982 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013983 if (arg->width == -1 && PyErr_Occurred())
13984 return -1;
13985 if (arg->width < 0) {
13986 arg->flags |= F_LJUST;
13987 arg->width = -arg->width;
13988 }
13989 if (--ctx->fmtcnt >= 0) {
13990 arg->ch = FORMAT_READ(ctx);
13991 ctx->fmtpos++;
13992 }
13993 }
13994 else if (arg->ch >= '0' && arg->ch <= '9') {
13995 arg->width = arg->ch - '0';
13996 while (--ctx->fmtcnt >= 0) {
13997 arg->ch = FORMAT_READ(ctx);
13998 ctx->fmtpos++;
13999 if (arg->ch < '0' || arg->ch > '9')
14000 break;
14001 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14002 mixing signed and unsigned comparison. Since arg->ch is between
14003 '0' and '9', casting to int is safe. */
14004 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14005 PyErr_SetString(PyExc_ValueError,
14006 "width too big");
14007 return -1;
14008 }
14009 arg->width = arg->width*10 + (arg->ch - '0');
14010 }
14011 }
14012
14013 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014014 if (arg->ch == '.') {
14015 arg->prec = 0;
14016 if (--ctx->fmtcnt >= 0) {
14017 arg->ch = FORMAT_READ(ctx);
14018 ctx->fmtpos++;
14019 }
14020 if (arg->ch == '*') {
14021 v = unicode_format_getnextarg(ctx);
14022 if (v == NULL)
14023 return -1;
14024 if (!PyLong_Check(v)) {
14025 PyErr_SetString(PyExc_TypeError,
14026 "* wants int");
14027 return -1;
14028 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014029 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014030 if (arg->prec == -1 && PyErr_Occurred())
14031 return -1;
14032 if (arg->prec < 0)
14033 arg->prec = 0;
14034 if (--ctx->fmtcnt >= 0) {
14035 arg->ch = FORMAT_READ(ctx);
14036 ctx->fmtpos++;
14037 }
14038 }
14039 else if (arg->ch >= '0' && arg->ch <= '9') {
14040 arg->prec = arg->ch - '0';
14041 while (--ctx->fmtcnt >= 0) {
14042 arg->ch = FORMAT_READ(ctx);
14043 ctx->fmtpos++;
14044 if (arg->ch < '0' || arg->ch > '9')
14045 break;
14046 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14047 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014048 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014049 return -1;
14050 }
14051 arg->prec = arg->prec*10 + (arg->ch - '0');
14052 }
14053 }
14054 }
14055
14056 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14057 if (ctx->fmtcnt >= 0) {
14058 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14059 if (--ctx->fmtcnt >= 0) {
14060 arg->ch = FORMAT_READ(ctx);
14061 ctx->fmtpos++;
14062 }
14063 }
14064 }
14065 if (ctx->fmtcnt < 0) {
14066 PyErr_SetString(PyExc_ValueError,
14067 "incomplete format");
14068 return -1;
14069 }
14070 return 0;
14071
14072#undef FORMAT_READ
14073}
14074
14075/* Format one argument. Supported conversion specifiers:
14076
14077 - "s", "r", "a": any type
14078 - "i", "d", "u", "o", "x", "X": int
14079 - "e", "E", "f", "F", "g", "G": float
14080 - "c": int or str (1 character)
14081
Victor Stinner8dbd4212012-12-04 09:30:24 +010014082 When possible, the output is written directly into the Unicode writer
14083 (ctx->writer). A string is created when padding is required.
14084
Victor Stinnera47082312012-10-04 02:19:54 +020014085 Return 0 if the argument has been formatted into *p_str,
14086 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014087 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014088static int
14089unicode_format_arg_format(struct unicode_formatter_t *ctx,
14090 struct unicode_format_arg_t *arg,
14091 PyObject **p_str)
14092{
14093 PyObject *v;
14094 _PyUnicodeWriter *writer = &ctx->writer;
14095
14096 if (ctx->fmtcnt == 0)
14097 ctx->writer.overallocate = 0;
14098
14099 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014100 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014101 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014102 return 1;
14103 }
14104
14105 v = unicode_format_getnextarg(ctx);
14106 if (v == NULL)
14107 return -1;
14108
Victor Stinnera47082312012-10-04 02:19:54 +020014109
14110 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014111 case 's':
14112 case 'r':
14113 case 'a':
14114 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14115 /* Fast path */
14116 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14117 return -1;
14118 return 1;
14119 }
14120
14121 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14122 *p_str = v;
14123 Py_INCREF(*p_str);
14124 }
14125 else {
14126 if (arg->ch == 's')
14127 *p_str = PyObject_Str(v);
14128 else if (arg->ch == 'r')
14129 *p_str = PyObject_Repr(v);
14130 else
14131 *p_str = PyObject_ASCII(v);
14132 }
14133 break;
14134
14135 case 'i':
14136 case 'd':
14137 case 'u':
14138 case 'o':
14139 case 'x':
14140 case 'X':
14141 {
14142 int ret = mainformatlong(v, arg, p_str, writer);
14143 if (ret != 0)
14144 return ret;
14145 arg->sign = 1;
14146 break;
14147 }
14148
14149 case 'e':
14150 case 'E':
14151 case 'f':
14152 case 'F':
14153 case 'g':
14154 case 'G':
14155 if (arg->width == -1 && arg->prec == -1
14156 && !(arg->flags & (F_SIGN | F_BLANK)))
14157 {
14158 /* Fast path */
14159 if (formatfloat(v, arg, NULL, writer) == -1)
14160 return -1;
14161 return 1;
14162 }
14163
14164 arg->sign = 1;
14165 if (formatfloat(v, arg, p_str, NULL) == -1)
14166 return -1;
14167 break;
14168
14169 case 'c':
14170 {
14171 Py_UCS4 ch = formatchar(v);
14172 if (ch == (Py_UCS4) -1)
14173 return -1;
14174 if (arg->width == -1 && arg->prec == -1) {
14175 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014176 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014177 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014178 return 1;
14179 }
14180 *p_str = PyUnicode_FromOrdinal(ch);
14181 break;
14182 }
14183
14184 default:
14185 PyErr_Format(PyExc_ValueError,
14186 "unsupported format character '%c' (0x%x) "
14187 "at index %zd",
14188 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14189 (int)arg->ch,
14190 ctx->fmtpos - 1);
14191 return -1;
14192 }
14193 if (*p_str == NULL)
14194 return -1;
14195 assert (PyUnicode_Check(*p_str));
14196 return 0;
14197}
14198
14199static int
14200unicode_format_arg_output(struct unicode_formatter_t *ctx,
14201 struct unicode_format_arg_t *arg,
14202 PyObject *str)
14203{
14204 Py_ssize_t len;
14205 enum PyUnicode_Kind kind;
14206 void *pbuf;
14207 Py_ssize_t pindex;
14208 Py_UCS4 signchar;
14209 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014210 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014211 Py_ssize_t sublen;
14212 _PyUnicodeWriter *writer = &ctx->writer;
14213 Py_UCS4 fill;
14214
14215 fill = ' ';
14216 if (arg->sign && arg->flags & F_ZERO)
14217 fill = '0';
14218
14219 if (PyUnicode_READY(str) == -1)
14220 return -1;
14221
14222 len = PyUnicode_GET_LENGTH(str);
14223 if ((arg->width == -1 || arg->width <= len)
14224 && (arg->prec == -1 || arg->prec >= len)
14225 && !(arg->flags & (F_SIGN | F_BLANK)))
14226 {
14227 /* Fast path */
14228 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14229 return -1;
14230 return 0;
14231 }
14232
14233 /* Truncate the string for "s", "r" and "a" formats
14234 if the precision is set */
14235 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14236 if (arg->prec >= 0 && len > arg->prec)
14237 len = arg->prec;
14238 }
14239
14240 /* Adjust sign and width */
14241 kind = PyUnicode_KIND(str);
14242 pbuf = PyUnicode_DATA(str);
14243 pindex = 0;
14244 signchar = '\0';
14245 if (arg->sign) {
14246 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14247 if (ch == '-' || ch == '+') {
14248 signchar = ch;
14249 len--;
14250 pindex++;
14251 }
14252 else if (arg->flags & F_SIGN)
14253 signchar = '+';
14254 else if (arg->flags & F_BLANK)
14255 signchar = ' ';
14256 else
14257 arg->sign = 0;
14258 }
14259 if (arg->width < len)
14260 arg->width = len;
14261
14262 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014263 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014264 if (!(arg->flags & F_LJUST)) {
14265 if (arg->sign) {
14266 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014267 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014268 }
14269 else {
14270 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014271 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014272 }
14273 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014274 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14275 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014276 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014277 }
14278
Victor Stinnera47082312012-10-04 02:19:54 +020014279 buflen = arg->width;
14280 if (arg->sign && len == arg->width)
14281 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014282 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014283 return -1;
14284
14285 /* Write the sign if needed */
14286 if (arg->sign) {
14287 if (fill != ' ') {
14288 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14289 writer->pos += 1;
14290 }
14291 if (arg->width > len)
14292 arg->width--;
14293 }
14294
14295 /* Write the numeric prefix for "x", "X" and "o" formats
14296 if the alternate form is used.
14297 For example, write "0x" for the "%#x" format. */
14298 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14299 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14300 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14301 if (fill != ' ') {
14302 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14303 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14304 writer->pos += 2;
14305 pindex += 2;
14306 }
14307 arg->width -= 2;
14308 if (arg->width < 0)
14309 arg->width = 0;
14310 len -= 2;
14311 }
14312
14313 /* Pad left with the fill character if needed */
14314 if (arg->width > len && !(arg->flags & F_LJUST)) {
14315 sublen = arg->width - len;
14316 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14317 writer->pos += sublen;
14318 arg->width = len;
14319 }
14320
14321 /* If padding with spaces: write sign if needed and/or numeric prefix if
14322 the alternate form is used */
14323 if (fill == ' ') {
14324 if (arg->sign) {
14325 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14326 writer->pos += 1;
14327 }
14328 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14329 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14330 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14331 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14332 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14333 writer->pos += 2;
14334 pindex += 2;
14335 }
14336 }
14337
14338 /* Write characters */
14339 if (len) {
14340 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14341 str, pindex, len);
14342 writer->pos += len;
14343 }
14344
14345 /* Pad right with the fill character if needed */
14346 if (arg->width > len) {
14347 sublen = arg->width - len;
14348 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14349 writer->pos += sublen;
14350 }
14351 return 0;
14352}
14353
14354/* Helper of PyUnicode_Format(): format one arg.
14355 Return 0 on success, raise an exception and return -1 on error. */
14356static int
14357unicode_format_arg(struct unicode_formatter_t *ctx)
14358{
14359 struct unicode_format_arg_t arg;
14360 PyObject *str;
14361 int ret;
14362
Victor Stinner8dbd4212012-12-04 09:30:24 +010014363 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14364 arg.flags = 0;
14365 arg.width = -1;
14366 arg.prec = -1;
14367 arg.sign = 0;
14368 str = NULL;
14369
Victor Stinnera47082312012-10-04 02:19:54 +020014370 ret = unicode_format_arg_parse(ctx, &arg);
14371 if (ret == -1)
14372 return -1;
14373
14374 ret = unicode_format_arg_format(ctx, &arg, &str);
14375 if (ret == -1)
14376 return -1;
14377
14378 if (ret != 1) {
14379 ret = unicode_format_arg_output(ctx, &arg, str);
14380 Py_DECREF(str);
14381 if (ret == -1)
14382 return -1;
14383 }
14384
14385 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14386 PyErr_SetString(PyExc_TypeError,
14387 "not all arguments converted during string formatting");
14388 return -1;
14389 }
14390 return 0;
14391}
14392
Alexander Belopolsky40018472011-02-26 01:02:56 +000014393PyObject *
14394PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014395{
Victor Stinnera47082312012-10-04 02:19:54 +020014396 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014397
Guido van Rossumd57fd912000-03-10 22:53:23 +000014398 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014399 PyErr_BadInternalCall();
14400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014401 }
Victor Stinnera47082312012-10-04 02:19:54 +020014402
14403 ctx.fmtstr = PyUnicode_FromObject(format);
14404 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014405 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014406 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14407 Py_DECREF(ctx.fmtstr);
14408 return NULL;
14409 }
14410 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14411 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14412 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14413 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014414
Victor Stinner8f674cc2013-04-17 23:02:17 +020014415 _PyUnicodeWriter_Init(&ctx.writer);
14416 ctx.writer.min_length = ctx.fmtcnt + 100;
14417 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014418
Guido van Rossumd57fd912000-03-10 22:53:23 +000014419 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014420 ctx.arglen = PyTuple_Size(args);
14421 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014422 }
14423 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014424 ctx.arglen = -1;
14425 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014426 }
Victor Stinnera47082312012-10-04 02:19:54 +020014427 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014428 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014429 ctx.dict = args;
14430 else
14431 ctx.dict = NULL;
14432 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014433
Victor Stinnera47082312012-10-04 02:19:54 +020014434 while (--ctx.fmtcnt >= 0) {
14435 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014436 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014437
14438 nonfmtpos = ctx.fmtpos++;
14439 while (ctx.fmtcnt >= 0 &&
14440 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14441 ctx.fmtpos++;
14442 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014443 }
Victor Stinnera47082312012-10-04 02:19:54 +020014444 if (ctx.fmtcnt < 0) {
14445 ctx.fmtpos--;
14446 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014447 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014448
Victor Stinnercfc4c132013-04-03 01:48:39 +020014449 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14450 nonfmtpos, ctx.fmtpos) < 0)
14451 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014452 }
14453 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014454 ctx.fmtpos++;
14455 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014456 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014457 }
14458 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014459
Victor Stinnera47082312012-10-04 02:19:54 +020014460 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014461 PyErr_SetString(PyExc_TypeError,
14462 "not all arguments converted during string formatting");
14463 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014464 }
14465
Victor Stinnera47082312012-10-04 02:19:54 +020014466 if (ctx.args_owned) {
14467 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014468 }
Victor Stinnera47082312012-10-04 02:19:54 +020014469 Py_DECREF(ctx.fmtstr);
14470 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014471
Benjamin Peterson29060642009-01-31 22:14:21 +000014472 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014473 Py_DECREF(ctx.fmtstr);
14474 _PyUnicodeWriter_Dealloc(&ctx.writer);
14475 if (ctx.args_owned) {
14476 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014477 }
14478 return NULL;
14479}
14480
Jeremy Hylton938ace62002-07-17 16:30:39 +000014481static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014482unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14483
Tim Peters6d6c1a32001-08-02 04:15:00 +000014484static PyObject *
14485unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14486{
Benjamin Peterson29060642009-01-31 22:14:21 +000014487 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014488 static char *kwlist[] = {"object", "encoding", "errors", 0};
14489 char *encoding = NULL;
14490 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014491
Benjamin Peterson14339b62009-01-31 16:36:08 +000014492 if (type != &PyUnicode_Type)
14493 return unicode_subtype_new(type, args, kwds);
14494 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014495 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014496 return NULL;
14497 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014498 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014499 if (encoding == NULL && errors == NULL)
14500 return PyObject_Str(x);
14501 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014502 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014503}
14504
Guido van Rossume023fe02001-08-30 03:12:59 +000014505static PyObject *
14506unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14507{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014508 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014509 Py_ssize_t length, char_size;
14510 int share_wstr, share_utf8;
14511 unsigned int kind;
14512 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014513
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014515
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014516 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014517 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014518 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014519 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014520 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014521 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014522 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014523 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014524
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014525 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014526 if (self == NULL) {
14527 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014528 return NULL;
14529 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014530 kind = PyUnicode_KIND(unicode);
14531 length = PyUnicode_GET_LENGTH(unicode);
14532
14533 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014534#ifdef Py_DEBUG
14535 _PyUnicode_HASH(self) = -1;
14536#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014537 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014538#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014539 _PyUnicode_STATE(self).interned = 0;
14540 _PyUnicode_STATE(self).kind = kind;
14541 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014542 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014543 _PyUnicode_STATE(self).ready = 1;
14544 _PyUnicode_WSTR(self) = NULL;
14545 _PyUnicode_UTF8_LENGTH(self) = 0;
14546 _PyUnicode_UTF8(self) = NULL;
14547 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014548 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014549
14550 share_utf8 = 0;
14551 share_wstr = 0;
14552 if (kind == PyUnicode_1BYTE_KIND) {
14553 char_size = 1;
14554 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14555 share_utf8 = 1;
14556 }
14557 else if (kind == PyUnicode_2BYTE_KIND) {
14558 char_size = 2;
14559 if (sizeof(wchar_t) == 2)
14560 share_wstr = 1;
14561 }
14562 else {
14563 assert(kind == PyUnicode_4BYTE_KIND);
14564 char_size = 4;
14565 if (sizeof(wchar_t) == 4)
14566 share_wstr = 1;
14567 }
14568
14569 /* Ensure we won't overflow the length. */
14570 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14571 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014572 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014573 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014574 data = PyObject_MALLOC((length + 1) * char_size);
14575 if (data == NULL) {
14576 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014577 goto onError;
14578 }
14579
Victor Stinnerc3c74152011-10-02 20:39:55 +020014580 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014581 if (share_utf8) {
14582 _PyUnicode_UTF8_LENGTH(self) = length;
14583 _PyUnicode_UTF8(self) = data;
14584 }
14585 if (share_wstr) {
14586 _PyUnicode_WSTR_LENGTH(self) = length;
14587 _PyUnicode_WSTR(self) = (wchar_t *)data;
14588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014589
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014590 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014591 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014592 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014593#ifdef Py_DEBUG
14594 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14595#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014596 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014597 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014598
14599onError:
14600 Py_DECREF(unicode);
14601 Py_DECREF(self);
14602 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014603}
14604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014605PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014606"str(object='') -> str\n\
14607str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014608\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014609Create a new string object from the given object. If encoding or\n\
14610errors is specified, then the object must expose a data buffer\n\
14611that will be decoded using the given encoding and error handler.\n\
14612Otherwise, returns the result of object.__str__() (if defined)\n\
14613or repr(object).\n\
14614encoding defaults to sys.getdefaultencoding().\n\
14615errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014616
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014617static PyObject *unicode_iter(PyObject *seq);
14618
Guido van Rossumd57fd912000-03-10 22:53:23 +000014619PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014620 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014621 "str", /* tp_name */
14622 sizeof(PyUnicodeObject), /* tp_size */
14623 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014624 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014625 (destructor)unicode_dealloc, /* tp_dealloc */
14626 0, /* tp_print */
14627 0, /* tp_getattr */
14628 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014629 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014630 unicode_repr, /* tp_repr */
14631 &unicode_as_number, /* tp_as_number */
14632 &unicode_as_sequence, /* tp_as_sequence */
14633 &unicode_as_mapping, /* tp_as_mapping */
14634 (hashfunc) unicode_hash, /* tp_hash*/
14635 0, /* tp_call*/
14636 (reprfunc) unicode_str, /* tp_str */
14637 PyObject_GenericGetAttr, /* tp_getattro */
14638 0, /* tp_setattro */
14639 0, /* tp_as_buffer */
14640 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014641 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014642 unicode_doc, /* tp_doc */
14643 0, /* tp_traverse */
14644 0, /* tp_clear */
14645 PyUnicode_RichCompare, /* tp_richcompare */
14646 0, /* tp_weaklistoffset */
14647 unicode_iter, /* tp_iter */
14648 0, /* tp_iternext */
14649 unicode_methods, /* tp_methods */
14650 0, /* tp_members */
14651 0, /* tp_getset */
14652 &PyBaseObject_Type, /* tp_base */
14653 0, /* tp_dict */
14654 0, /* tp_descr_get */
14655 0, /* tp_descr_set */
14656 0, /* tp_dictoffset */
14657 0, /* tp_init */
14658 0, /* tp_alloc */
14659 unicode_new, /* tp_new */
14660 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014661};
14662
14663/* Initialize the Unicode implementation */
14664
Victor Stinner3a50e702011-10-18 21:21:00 +020014665int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014666{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014667 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014668 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014669 0x000A, /* LINE FEED */
14670 0x000D, /* CARRIAGE RETURN */
14671 0x001C, /* FILE SEPARATOR */
14672 0x001D, /* GROUP SEPARATOR */
14673 0x001E, /* RECORD SEPARATOR */
14674 0x0085, /* NEXT LINE */
14675 0x2028, /* LINE SEPARATOR */
14676 0x2029, /* PARAGRAPH SEPARATOR */
14677 };
14678
Fred Drakee4315f52000-05-09 19:53:39 +000014679 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014680 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014681 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014682 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014683 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014684
Guido van Rossumcacfc072002-05-24 19:01:59 +000014685 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014686 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014687
14688 /* initialize the linebreak bloom filter */
14689 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014690 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014691 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014692
Christian Heimes26532f72013-07-20 14:57:16 +020014693 if (PyType_Ready(&EncodingMapType) < 0)
14694 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014695
Benjamin Petersonc4311282012-10-30 23:21:10 -040014696 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14697 Py_FatalError("Can't initialize field name iterator type");
14698
14699 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14700 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014701
Victor Stinner3a50e702011-10-18 21:21:00 +020014702#ifdef HAVE_MBCS
14703 winver.dwOSVersionInfoSize = sizeof(winver);
14704 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14705 PyErr_SetFromWindowsErr(0);
14706 return -1;
14707 }
14708#endif
14709 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014710}
14711
14712/* Finalize the Unicode implementation */
14713
/* There is nothing to release here: the function does no work and always
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14719
Guido van Rossumd57fd912000-03-10 22:53:23 +000014720void
Thomas Wouters78890102000-07-22 19:25:51 +000014721_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014722{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014723 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014724
Serhiy Storchaka05997252013-01-26 12:14:02 +020014725 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014726
Serhiy Storchaka05997252013-01-26 12:14:02 +020014727 for (i = 0; i < 256; i++)
14728 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014729 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014730 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014731}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014732
Walter Dörwald16807132007-05-25 13:52:07 +000014733void
14734PyUnicode_InternInPlace(PyObject **p)
14735{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014736 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014737 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014738#ifdef Py_DEBUG
14739 assert(s != NULL);
14740 assert(_PyUnicode_CHECK(s));
14741#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014742 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014743 return;
14744#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014745 /* If it's a subclass, we don't really know what putting
14746 it in the interned dict might do. */
14747 if (!PyUnicode_CheckExact(s))
14748 return;
14749 if (PyUnicode_CHECK_INTERNED(s))
14750 return;
14751 if (interned == NULL) {
14752 interned = PyDict_New();
14753 if (interned == NULL) {
14754 PyErr_Clear(); /* Don't leave an exception */
14755 return;
14756 }
14757 }
14758 /* It might be that the GetItem call fails even
14759 though the key is present in the dictionary,
14760 namely when this happens during a stack overflow. */
14761 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014762 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014763 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014764
Victor Stinnerf0335102013-04-14 19:13:03 +020014765 if (t) {
14766 Py_INCREF(t);
14767 Py_DECREF(*p);
14768 *p = t;
14769 return;
14770 }
Walter Dörwald16807132007-05-25 13:52:07 +000014771
Benjamin Peterson14339b62009-01-31 16:36:08 +000014772 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014773 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014774 PyErr_Clear();
14775 PyThreadState_GET()->recursion_critical = 0;
14776 return;
14777 }
14778 PyThreadState_GET()->recursion_critical = 0;
14779 /* The two references in interned are not counted by refcnt.
14780 The deallocator will take care of this */
14781 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014782 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014783}
14784
14785void
14786PyUnicode_InternImmortal(PyObject **p)
14787{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014788 PyUnicode_InternInPlace(p);
14789 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014790 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014791 Py_INCREF(*p);
14792 }
Walter Dörwald16807132007-05-25 13:52:07 +000014793}
14794
14795PyObject *
14796PyUnicode_InternFromString(const char *cp)
14797{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014798 PyObject *s = PyUnicode_FromString(cp);
14799 if (s == NULL)
14800 return NULL;
14801 PyUnicode_InternInPlace(&s);
14802 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014803}
14804
Alexander Belopolsky40018472011-02-26 01:02:56 +000014805void
14806_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014807{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014808 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014809 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014810 Py_ssize_t i, n;
14811 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014812
Benjamin Peterson14339b62009-01-31 16:36:08 +000014813 if (interned == NULL || !PyDict_Check(interned))
14814 return;
14815 keys = PyDict_Keys(interned);
14816 if (keys == NULL || !PyList_Check(keys)) {
14817 PyErr_Clear();
14818 return;
14819 }
Walter Dörwald16807132007-05-25 13:52:07 +000014820
Benjamin Peterson14339b62009-01-31 16:36:08 +000014821 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14822 detector, interned unicode strings are not forcibly deallocated;
14823 rather, we give them their stolen references back, and then clear
14824 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014825
Benjamin Peterson14339b62009-01-31 16:36:08 +000014826 n = PyList_GET_SIZE(keys);
14827 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014828 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014829 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014830 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014831 if (PyUnicode_READY(s) == -1) {
14832 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014833 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014835 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014836 case SSTATE_NOT_INTERNED:
14837 /* XXX Shouldn't happen */
14838 break;
14839 case SSTATE_INTERNED_IMMORTAL:
14840 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014841 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014842 break;
14843 case SSTATE_INTERNED_MORTAL:
14844 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014845 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014846 break;
14847 default:
14848 Py_FatalError("Inconsistent interned string state.");
14849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014850 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014851 }
14852 fprintf(stderr, "total size of all interned strings: "
14853 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14854 "mortal/immortal\n", mortal_size, immortal_size);
14855 Py_DECREF(keys);
14856 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014857 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014858}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014859
14860
14861/********************* Unicode Iterator **************************/
14862
14863typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014864 PyObject_HEAD
14865 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014866 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014867} unicodeiterobject;
14868
14869static void
14870unicodeiter_dealloc(unicodeiterobject *it)
14871{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014872 _PyObject_GC_UNTRACK(it);
14873 Py_XDECREF(it->it_seq);
14874 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014875}
14876
14877static int
14878unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14879{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014880 Py_VISIT(it->it_seq);
14881 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014882}
14883
14884static PyObject *
14885unicodeiter_next(unicodeiterobject *it)
14886{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014887 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014888
Benjamin Peterson14339b62009-01-31 16:36:08 +000014889 assert(it != NULL);
14890 seq = it->it_seq;
14891 if (seq == NULL)
14892 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014893 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014895 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14896 int kind = PyUnicode_KIND(seq);
14897 void *data = PyUnicode_DATA(seq);
14898 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14899 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014900 if (item != NULL)
14901 ++it->it_index;
14902 return item;
14903 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014904
Benjamin Peterson14339b62009-01-31 16:36:08 +000014905 Py_DECREF(seq);
14906 it->it_seq = NULL;
14907 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014908}
14909
14910static PyObject *
14911unicodeiter_len(unicodeiterobject *it)
14912{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014913 Py_ssize_t len = 0;
14914 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014915 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014916 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014917}
14918
14919PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14920
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014921static PyObject *
14922unicodeiter_reduce(unicodeiterobject *it)
14923{
14924 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014925 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014926 it->it_seq, it->it_index);
14927 } else {
14928 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14929 if (u == NULL)
14930 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014931 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014932 }
14933}
14934
14935PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14936
14937static PyObject *
14938unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14939{
14940 Py_ssize_t index = PyLong_AsSsize_t(state);
14941 if (index == -1 && PyErr_Occurred())
14942 return NULL;
14943 if (index < 0)
14944 index = 0;
14945 it->it_index = index;
14946 Py_RETURN_NONE;
14947}
14948
14949PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14950
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014951static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014952 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014953 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014954 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14955 reduce_doc},
14956 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14957 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014958 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014959};
14960
14961PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014962 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14963 "str_iterator", /* tp_name */
14964 sizeof(unicodeiterobject), /* tp_basicsize */
14965 0, /* tp_itemsize */
14966 /* methods */
14967 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14968 0, /* tp_print */
14969 0, /* tp_getattr */
14970 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014971 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 0, /* tp_repr */
14973 0, /* tp_as_number */
14974 0, /* tp_as_sequence */
14975 0, /* tp_as_mapping */
14976 0, /* tp_hash */
14977 0, /* tp_call */
14978 0, /* tp_str */
14979 PyObject_GenericGetAttr, /* tp_getattro */
14980 0, /* tp_setattro */
14981 0, /* tp_as_buffer */
14982 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14983 0, /* tp_doc */
14984 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14985 0, /* tp_clear */
14986 0, /* tp_richcompare */
14987 0, /* tp_weaklistoffset */
14988 PyObject_SelfIter, /* tp_iter */
14989 (iternextfunc)unicodeiter_next, /* tp_iternext */
14990 unicodeiter_methods, /* tp_methods */
14991 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014992};
14993
14994static PyObject *
14995unicode_iter(PyObject *seq)
14996{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014997 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014998
Benjamin Peterson14339b62009-01-31 16:36:08 +000014999 if (!PyUnicode_Check(seq)) {
15000 PyErr_BadInternalCall();
15001 return NULL;
15002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015003 if (PyUnicode_READY(seq) == -1)
15004 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015005 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15006 if (it == NULL)
15007 return NULL;
15008 it->it_index = 0;
15009 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015010 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015011 _PyObject_GC_TRACK(it);
15012 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015013}
15014
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015015
15016size_t
15017Py_UNICODE_strlen(const Py_UNICODE *u)
15018{
15019 int res = 0;
15020 while(*u++)
15021 res++;
15022 return res;
15023}
15024
15025Py_UNICODE*
15026Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15027{
15028 Py_UNICODE *u = s1;
15029 while ((*u++ = *s2++));
15030 return s1;
15031}
15032
15033Py_UNICODE*
15034Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15035{
15036 Py_UNICODE *u = s1;
15037 while ((*u++ = *s2++))
15038 if (n-- == 0)
15039 break;
15040 return s1;
15041}
15042
15043Py_UNICODE*
15044Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15045{
15046 Py_UNICODE *u1 = s1;
15047 u1 += Py_UNICODE_strlen(u1);
15048 Py_UNICODE_strcpy(u1, s2);
15049 return s1;
15050}
15051
15052int
15053Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15054{
15055 while (*s1 && *s2 && *s1 == *s2)
15056 s1++, s2++;
15057 if (*s1 && *s2)
15058 return (*s1 < *s2) ? -1 : +1;
15059 if (*s1)
15060 return 1;
15061 if (*s2)
15062 return -1;
15063 return 0;
15064}
15065
15066int
15067Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015069 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015070 for (; n != 0; n--) {
15071 u1 = *s1;
15072 u2 = *s2;
15073 if (u1 != u2)
15074 return (u1 < u2) ? -1 : +1;
15075 if (u1 == '\0')
15076 return 0;
15077 s1++;
15078 s2++;
15079 }
15080 return 0;
15081}
15082
15083Py_UNICODE*
15084Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15085{
15086 const Py_UNICODE *p;
15087 for (p = s; *p; p++)
15088 if (*p == c)
15089 return (Py_UNICODE*)p;
15090 return NULL;
15091}
15092
15093Py_UNICODE*
15094Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15095{
15096 const Py_UNICODE *p;
15097 p = s + Py_UNICODE_strlen(s);
15098 while (p != s) {
15099 p--;
15100 if (*p == c)
15101 return (Py_UNICODE*)p;
15102 }
15103 return NULL;
15104}
Victor Stinner331ea922010-08-10 16:37:20 +000015105
Victor Stinner71133ff2010-09-01 23:43:53 +000015106Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015107PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015108{
Victor Stinner577db2c2011-10-11 22:12:48 +020015109 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015110 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015112 if (!PyUnicode_Check(unicode)) {
15113 PyErr_BadArgument();
15114 return NULL;
15115 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015116 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015117 if (u == NULL)
15118 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015119 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015120 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015121 PyErr_NoMemory();
15122 return NULL;
15123 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015124 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015125 size *= sizeof(Py_UNICODE);
15126 copy = PyMem_Malloc(size);
15127 if (copy == NULL) {
15128 PyErr_NoMemory();
15129 return NULL;
15130 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015131 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015132 return copy;
15133}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015134
Georg Brandl66c221e2010-10-14 07:04:07 +000015135/* A _string module, to export formatter_parser and formatter_field_name_split
15136 to the string.Formatter class implemented in Python. */
15137
15138static PyMethodDef _string_methods[] = {
15139 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15140 METH_O, PyDoc_STR("split the argument as a field name")},
15141 {"formatter_parser", (PyCFunction) formatter_parser,
15142 METH_O, PyDoc_STR("parse the argument as a format string")},
15143 {NULL, NULL}
15144};
15145
15146static struct PyModuleDef _string_module = {
15147 PyModuleDef_HEAD_INIT,
15148 "_string",
15149 PyDoc_STR("string helper module"),
15150 0,
15151 _string_methods,
15152 NULL,
15153 NULL,
15154 NULL,
15155 NULL
15156};
15157
15158PyMODINIT_FUNC
15159PyInit__string(void)
15160{
15161 return PyModule_Create(&_string_module);
15162}
15163
15164
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015165#ifdef __cplusplus
15166}
15167#endif