blob: d4cb9c985da1060e23055edeee64b02114f95755 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
Serhiy Storchaka05997252013-01-26 12:14:02 +020052NOTE: In the interpreter's initialization phase, some globals are currently
53 initialized dynamically as needed. In the process Unicode objects may
54 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055
56*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Largest valid code point as of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros in this file.  Debug builds run
   the consistency checker (without inspecting the content); other builds
   only test the object type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* Unchecked access to the utf8 pointer of a compact (non-ASCII) object. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located directly after the PyASCIIObject header; for every other
   string it is the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((char *)((PyASCIIObject *)(op) + 1)) \
          : _PyUnicode_UTF8(op)))

/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII string
   this is the length field; otherwise it is utf8_length. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((PyASCIIObject *)(op))->length \
          : _PyUnicode_UTF8_LENGTH(op)))
/* Direct field accessors.  Unless noted otherwise they perform no checks and
   expand to the struct member itself. */
#define _PyUnicode_WSTR(op)         (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

/* Checked variants: assert that op passes _PyUnicode_CHECK() first. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a non-compact object (PyUnicodeObject.data.any). */
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* Cheaper replacement for Py_MAX() when combining two "maximum character"
   values: use it when you are computing the second argument of
   PyUnicode_New().  The result is the bitwise OR of both operands, which is
   never smaller than either of them. */
#define MAX_MAXCHAR(maxchar1, maxchar2) ((maxchar1) | (maxchar2))
111
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when op is
   already ready, otherwise to the result of _PyUnicode_Ready(op).  The
   argument is checked with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of op is the same address as its character data.
   Must not be used on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True if the wstr pointer of op is the same address as its character data.
   The macro must only refer to its own parameter: the previous expansion
   read _PyUnicode_WSTR(unicode), which silently picked up whatever variable
   named "unicode" was in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separately allocated UTF-8 memory block,
   i.e. it is not a compact ASCII string, its utf8 pointer is set, and that
   pointer is not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) && \
      _PyUnicode_UTF8(op) && \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
134
/* True if the Unicode object owns a separately allocated wstr memory block:
   wstr is set and either the string is not ready yet or wstr is not shared
   with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) \
      && (!PyUnicode_IS_READY(op) \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
142
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and must be of type "from_type *".
   to points to the buffer that receives the converted characters; it is
   cast to "to_type *" as a whole expression, so arguments such as
   "buffer + offset" are converted after the addition, not before.

   The main loop copies four characters per iteration; the trailing loop
   handles the remaining 0..3 characters.  All temporaries carry a leading
   underscore so they cannot collide with names used at the call site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200166
Walter Dörwald16807132007-05-25 13:52:07 +0000167/* This dictionary holds all interned unicode strings. Note that references
168 to strings in this dictionary are *not* counted in the string's ob_refcnt.
169 When the interned string reaches a refcnt of 0 the string deallocation
170 function will delete the reference from this dictionary.
171
172 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000173 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000174*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200175static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000177/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179
Serhiy Storchaka678db842013-01-26 12:16:36 +0200180#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200181 do { \
182 if (unicode_empty != NULL) \
183 Py_INCREF(unicode_empty); \
184 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185 unicode_empty = PyUnicode_New(0, 0); \
186 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200187 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
189 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200190 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200191 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192
Serhiy Storchaka678db842013-01-26 12:16:36 +0200193#define _Py_RETURN_UNICODE_EMPTY() \
194 do { \
195 _Py_INCREF_UNICODE_EMPTY(); \
196 return unicode_empty; \
197 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200199/* Forward declaration */
200Py_LOCAL_INLINE(int)
201_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
202
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200205
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206/* Single character Unicode strings in the Latin-1 range are being
207 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200208static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code: 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
240
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200243static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100244static int unicode_modifiable(PyObject *unicode);
245
Victor Stinnerfe226c02011-10-03 03:52:20 +0200246
Alexander Belopolsky40018472011-02-26 01:02:56 +0000247static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100248_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200249static PyObject *
250_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
251static PyObject *
252_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
253
254static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000255unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100257 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000258 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static void
261raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300262 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100263 PyObject *unicode,
264 Py_ssize_t startpos, Py_ssize_t endpos,
265 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000266
/* Same as _Py_ascii_whitespace, but for line breaks: 128-entry table indexed
   by ASCII code, 1 for a line break character and 0 otherwise.  The table is
   only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Verify the internal invariants of a Unicode object with assert().

   op must pass PyUnicode_Check().  The state flags, the kind and the
   utf8/wstr pointers and lengths are always cross-checked.  When
   check_content is non-zero and the string is not in the legacy wchar
   representation, the characters are scanned as well to verify that the
   narrowest possible kind is used and that the buffer is null-terminated.

   Returns 1 always, so the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;
    /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready yet */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that is ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (highest < ch)
                highest = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be the null terminator */
        assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
#ifdef HAVE_MBCS
/* Windows version information, only present when MBCS support is compiled
   in.  NOTE(review): where this gets filled in is not visible here. */
static OSVERSIONINFOEX winver;
#endif
527
Thomas Wouters477c8d52006-05-27 19:21:47 +0000528/* --- Bloom Filters ----------------------------------------------------- */
529
530/* stuff to implement simple "bloom filters" for Unicode characters.
531 to keep things simple, we use a single bitmask, using the least 5
532 bits from each unicode characters as the bit index. */
533
534/* the linebreak mask is set up by Unicode_Init below */
535
Antoine Pitrouf068f942010-01-13 14:19:12 +0000536#if LONG_BIT >= 128
537#define BLOOM_WIDTH 128
538#elif LONG_BIT >= 64
539#define BLOOM_WIDTH 64
540#elif LONG_BIT >= 32
541#define BLOOM_WIDTH 32
542#else
543#error "LONG_BIT is smaller than 32"
544#endif
545
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Mask used for line break detection; starts with every bit set. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by ch is set in mask.  Both arguments are
   parenthesized so that compound expressions such as "a | b" can be passed
   as the mask without being re-associated by operator precedence. */
#define BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if ch is a line break: table lookup for ch < 128, otherwise the
   bloom mask is tested first and Py_UNICODE_ISLINEBREAK() only on a hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Alexander Belopolsky40018472011-02-26 01:02:56 +0000556Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558{
Victor Stinnera85af502013-04-09 21:53:54 +0200559#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
560 do { \
561 TYPE *data = (TYPE *)PTR; \
562 TYPE *end = data + LEN; \
563 Py_UCS4 ch; \
564 for (; data != end; data++) { \
565 ch = *data; \
566 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
567 } \
568 break; \
569 } while (0)
570
Thomas Wouters477c8d52006-05-27 19:21:47 +0000571 /* calculate simple bloom-style bitmask for a given unicode string */
572
Antoine Pitrouf068f942010-01-13 14:19:12 +0000573 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000574
575 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200576 switch (kind) {
577 case PyUnicode_1BYTE_KIND:
578 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
579 break;
580 case PyUnicode_2BYTE_KIND:
581 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
582 break;
583 case PyUnicode_4BYTE_KIND:
584 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
585 break;
586 default:
587 assert(0);
588 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000589 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200590
591#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000592}
593
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200594/* Compilation of templated routines */
595
596#include "stringlib/asciilib.h"
597#include "stringlib/fastsearch.h"
598#include "stringlib/partition.h"
599#include "stringlib/split.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
602#include "stringlib/find_max_char.h"
603#include "stringlib/localeutil.h"
604#include "stringlib/undef.h"
605
606#include "stringlib/ucs1lib.h"
607#include "stringlib/fastsearch.h"
608#include "stringlib/partition.h"
609#include "stringlib/split.h"
610#include "stringlib/count.h"
611#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300612#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200613#include "stringlib/find_max_char.h"
614#include "stringlib/localeutil.h"
615#include "stringlib/undef.h"
616
617#include "stringlib/ucs2lib.h"
618#include "stringlib/fastsearch.h"
619#include "stringlib/partition.h"
620#include "stringlib/split.h"
621#include "stringlib/count.h"
622#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300623#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200624#include "stringlib/find_max_char.h"
625#include "stringlib/localeutil.h"
626#include "stringlib/undef.h"
627
628#include "stringlib/ucs4lib.h"
629#include "stringlib/fastsearch.h"
630#include "stringlib/partition.h"
631#include "stringlib/split.h"
632#include "stringlib/count.h"
633#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300634#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200635#include "stringlib/find_max_char.h"
636#include "stringlib/localeutil.h"
637#include "stringlib/undef.h"
638
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200639#include "stringlib/unicodedefs.h"
640#include "stringlib/fastsearch.h"
641#include "stringlib/count.h"
642#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100643#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645/* --- Unicode Object ----------------------------------------------------- */
646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200648fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
651 Py_ssize_t size, Py_UCS4 ch,
652 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200653{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200654 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
655
656 switch (kind) {
657 case PyUnicode_1BYTE_KIND:
658 {
659 Py_UCS1 ch1 = (Py_UCS1) ch;
660 if (ch1 == ch)
661 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
662 else
663 return -1;
664 }
665 case PyUnicode_2BYTE_KIND:
666 {
667 Py_UCS2 ch2 = (Py_UCS2) ch;
668 if (ch2 == ch)
669 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
670 else
671 return -1;
672 }
673 case PyUnicode_4BYTE_KIND:
674 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
675 default:
676 assert(0);
677 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679}
680
#ifdef Py_DEBUG
/* Fill the data of an Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index old_length up to the current length are
   overwritten (with 0xff bytes); nothing happens if the string did not
   grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
699
Victor Stinnerfe226c02011-10-03 03:52:20 +0200700static PyObject*
701resize_compact(PyObject *unicode, Py_ssize_t length)
702{
703 Py_ssize_t char_size;
704 Py_ssize_t struct_size;
705 Py_ssize_t new_size;
706 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100707 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200708#ifdef Py_DEBUG
709 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
710#endif
711
Victor Stinner79891572012-05-03 13:43:07 +0200712 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100714 assert(PyUnicode_IS_COMPACT(unicode));
715
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200716 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100717 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 struct_size = sizeof(PyASCIIObject);
719 else
720 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200721 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
724 PyErr_NoMemory();
725 return NULL;
726 }
727 new_size = (struct_size + (length + 1) * char_size);
728
Victor Stinner84def372011-12-11 20:04:56 +0100729 _Py_DEC_REFTOTAL;
730 _Py_ForgetReference(unicode);
731
732 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
733 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100734 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 PyErr_NoMemory();
736 return NULL;
737 }
Victor Stinner84def372011-12-11 20:04:56 +0100738 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100740
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200742 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100744 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200745 _PyUnicode_WSTR_LENGTH(unicode) = length;
746 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100747 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
748 PyObject_DEL(_PyUnicode_WSTR(unicode));
749 _PyUnicode_WSTR(unicode) = NULL;
750 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200751#ifdef Py_DEBUG
752 unicode_fill_invalid(unicode, old_length);
753#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200754 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
755 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200757 return unicode;
758}
759
Alexander Belopolsky40018472011-02-26 01:02:56 +0000760static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200761resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762{
Victor Stinner95663112011-10-04 01:03:50 +0200763 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200765 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000767
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768 if (PyUnicode_IS_READY(unicode)) {
769 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200770 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200772#ifdef Py_DEBUG
773 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
774#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775
776 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200777 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200778 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
779 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780
781 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
782 PyErr_NoMemory();
783 return -1;
784 }
785 new_size = (length + 1) * char_size;
786
Victor Stinner7a9105a2011-12-12 00:13:42 +0100787 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
788 {
789 PyObject_DEL(_PyUnicode_UTF8(unicode));
790 _PyUnicode_UTF8(unicode) = NULL;
791 _PyUnicode_UTF8_LENGTH(unicode) = 0;
792 }
793
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 data = (PyObject *)PyObject_REALLOC(data, new_size);
795 if (data == NULL) {
796 PyErr_NoMemory();
797 return -1;
798 }
799 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200801 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200802 _PyUnicode_WSTR_LENGTH(unicode) = length;
803 }
804 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200805 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200806 _PyUnicode_UTF8_LENGTH(unicode) = length;
807 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808 _PyUnicode_LENGTH(unicode) = length;
809 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200810#ifdef Py_DEBUG
811 unicode_fill_invalid(unicode, old_length);
812#endif
Victor Stinner95663112011-10-04 01:03:50 +0200813 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200814 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200816 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200817 }
Victor Stinner95663112011-10-04 01:03:50 +0200818 assert(_PyUnicode_WSTR(unicode) != NULL);
819
820 /* check for integer overflow */
821 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
822 PyErr_NoMemory();
823 return -1;
824 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200826 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100827 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200828 if (!wstr) {
829 PyErr_NoMemory();
830 return -1;
831 }
832 _PyUnicode_WSTR(unicode) = wstr;
833 _PyUnicode_WSTR(unicode)[length] = 0;
834 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200835 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 return 0;
837}
838
Victor Stinnerfe226c02011-10-03 03:52:20 +0200839static PyObject*
840resize_copy(PyObject *unicode, Py_ssize_t length)
841{
842 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200844 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845
Benjamin Petersonbac79492012-01-14 13:34:47 -0500846 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100847 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200848
849 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
850 if (copy == NULL)
851 return NULL;
852
853 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200854 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200855 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200856 }
857 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100859
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200860 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200861 if (w == NULL)
862 return NULL;
863 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
864 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200865 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
866 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200867 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200868 }
869}
870
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000872 Ux0000 terminated; some code (e.g. new_identifier)
873 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874
875 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000876 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877
878*/
879
Alexander Belopolsky40018472011-02-26 01:02:56 +0000880static PyUnicodeObject *
881_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882{
883 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885
Thomas Wouters477c8d52006-05-27 19:21:47 +0000886 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000887 if (length == 0 && unicode_empty != NULL) {
888 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200889 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 }
891
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000892 /* Ensure we won't overflow the size. */
893 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
894 return (PyUnicodeObject *)PyErr_NoMemory();
895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896 if (length < 0) {
897 PyErr_SetString(PyExc_SystemError,
898 "Negative size passed to _PyUnicode_New");
899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900 }
901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
903 if (unicode == NULL)
904 return NULL;
905 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
906 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
907 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100908 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000909 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100910 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912
Jeremy Hyltond8082792003-09-16 19:41:39 +0000913 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000914 * the caller fails before initializing str -- unicode_resize()
915 * reads str[0], and the Keep-Alive optimization can keep memory
916 * allocated for str alive across a call to unicode_dealloc(unicode).
917 * We don't want unicode_resize to read uninitialized memory in
918 * that case.
919 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200920 _PyUnicode_WSTR(unicode)[0] = 0;
921 _PyUnicode_WSTR(unicode)[length] = 0;
922 _PyUnicode_WSTR_LENGTH(unicode) = length;
923 _PyUnicode_HASH(unicode) = -1;
924 _PyUnicode_STATE(unicode).interned = 0;
925 _PyUnicode_STATE(unicode).kind = 0;
926 _PyUnicode_STATE(unicode).compact = 0;
927 _PyUnicode_STATE(unicode).ready = 0;
928 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200929 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200930 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200931 _PyUnicode_UTF8(unicode) = NULL;
932 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100933 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 return unicode;
935}
936
Victor Stinnerf42dc442011-10-02 23:33:16 +0200937static const char*
938unicode_kind_name(PyObject *unicode)
939{
Victor Stinner42dfd712011-10-03 14:41:45 +0200940 /* don't check consistency: unicode_kind_name() is called from
941 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200942 if (!PyUnicode_IS_COMPACT(unicode))
943 {
944 if (!PyUnicode_IS_READY(unicode))
945 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600946 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200947 {
948 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200949 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200950 return "legacy ascii";
951 else
952 return "legacy latin1";
953 case PyUnicode_2BYTE_KIND:
954 return "legacy UCS2";
955 case PyUnicode_4BYTE_KIND:
956 return "legacy UCS4";
957 default:
958 return "<legacy invalid kind>";
959 }
960 }
961 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600962 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200963 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200965 return "ascii";
966 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200967 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200968 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200969 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200971 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200972 default:
973 return "<invalid compact kind>";
974 }
975}
976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
982
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
986void *_PyUnicode_data(void *unicode){
987 printf("obj %p\n", unicode);
988 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
989 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
990 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
991 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
992 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
993 return PyUnicode_DATA(unicode);
994}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200995
996void
997_PyUnicode_Dump(PyObject *op)
998{
999 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001000 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1001 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1002 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001003
Victor Stinnera849a4b2011-10-03 12:12:11 +02001004 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001005 {
1006 if (ascii->state.ascii)
1007 data = (ascii + 1);
1008 else
1009 data = (compact + 1);
1010 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001011 else
1012 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001013 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1014
Victor Stinnera849a4b2011-10-03 12:12:11 +02001015 if (ascii->wstr == data)
1016 printf("shared ");
1017 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001018
Victor Stinnera3b334d2011-10-03 13:53:37 +02001019 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001020 printf(" (%zu), ", compact->wstr_length);
1021 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1022 printf("shared ");
1023 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001024 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001025 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001026}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027#endif
1028
1029PyObject *
1030PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1031{
1032 PyObject *obj;
1033 PyCompactUnicodeObject *unicode;
1034 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001035 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001036 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 Py_ssize_t char_size;
1038 Py_ssize_t struct_size;
1039
1040 /* Optimization for empty strings */
1041 if (size == 0 && unicode_empty != NULL) {
1042 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001043 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 }
1045
Victor Stinner9e9d6892011-10-04 01:02:02 +02001046 is_ascii = 0;
1047 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 struct_size = sizeof(PyCompactUnicodeObject);
1049 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001050 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 char_size = 1;
1052 is_ascii = 1;
1053 struct_size = sizeof(PyASCIIObject);
1054 }
1055 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001056 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 char_size = 1;
1058 }
1059 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001060 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 char_size = 2;
1062 if (sizeof(wchar_t) == 2)
1063 is_sharing = 1;
1064 }
1065 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001066 if (maxchar > MAX_UNICODE) {
1067 PyErr_SetString(PyExc_SystemError,
1068 "invalid maximum character passed to PyUnicode_New");
1069 return NULL;
1070 }
Victor Stinner8f825062012-04-27 13:55:39 +02001071 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 char_size = 4;
1073 if (sizeof(wchar_t) == 4)
1074 is_sharing = 1;
1075 }
1076
1077 /* Ensure we won't overflow the size. */
1078 if (size < 0) {
1079 PyErr_SetString(PyExc_SystemError,
1080 "Negative size passed to PyUnicode_New");
1081 return NULL;
1082 }
1083 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1084 return PyErr_NoMemory();
1085
1086 /* Duplicated allocation code from _PyObject_New() instead of a call to
1087 * PyObject_New() so we are able to allocate space for the object and
1088 * it's data buffer.
1089 */
1090 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1091 if (obj == NULL)
1092 return PyErr_NoMemory();
1093 obj = PyObject_INIT(obj, &PyUnicode_Type);
1094 if (obj == NULL)
1095 return NULL;
1096
1097 unicode = (PyCompactUnicodeObject *)obj;
1098 if (is_ascii)
1099 data = ((PyASCIIObject*)obj) + 1;
1100 else
1101 data = unicode + 1;
1102 _PyUnicode_LENGTH(unicode) = size;
1103 _PyUnicode_HASH(unicode) = -1;
1104 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001105 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 _PyUnicode_STATE(unicode).compact = 1;
1107 _PyUnicode_STATE(unicode).ready = 1;
1108 _PyUnicode_STATE(unicode).ascii = is_ascii;
1109 if (is_ascii) {
1110 ((char*)data)[size] = 0;
1111 _PyUnicode_WSTR(unicode) = NULL;
1112 }
Victor Stinner8f825062012-04-27 13:55:39 +02001113 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 ((char*)data)[size] = 0;
1115 _PyUnicode_WSTR(unicode) = NULL;
1116 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001118 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 else {
1121 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001122 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001123 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001125 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 ((Py_UCS4*)data)[size] = 0;
1127 if (is_sharing) {
1128 _PyUnicode_WSTR_LENGTH(unicode) = size;
1129 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1130 }
1131 else {
1132 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1133 _PyUnicode_WSTR(unicode) = NULL;
1134 }
1135 }
Victor Stinner8f825062012-04-27 13:55:39 +02001136#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001137 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001138#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001139 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001140 return obj;
1141}
1142
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: the
   wchar_t range [begin; end) is decoded into the UCS4 buffer of `unicode`.
   A high surrogate directly followed by a low surrogate is joined into one
   code point; every other unit, lone surrogates included, is copied as is.
   The other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Surrogate pair: two input units, one output code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* The output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1182
Victor Stinnercd9950f2011-10-02 00:34:53 +02001183static int
Victor Stinner488fa492011-12-12 00:01:39 +01001184unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001185{
Victor Stinner488fa492011-12-12 00:01:39 +01001186 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001187 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001188 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001189 return -1;
1190 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001191 return 0;
1192}
1193
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001194static int
1195_copy_characters(PyObject *to, Py_ssize_t to_start,
1196 PyObject *from, Py_ssize_t from_start,
1197 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 unsigned int from_kind, to_kind;
1200 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201
Victor Stinneree4544c2012-05-09 22:24:08 +02001202 assert(0 <= how_many);
1203 assert(0 <= from_start);
1204 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001205 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001207 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208
Victor Stinnerd3f08822012-05-29 12:57:52 +02001209 assert(PyUnicode_Check(to));
1210 assert(PyUnicode_IS_READY(to));
1211 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1212
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001213 if (how_many == 0)
1214 return 0;
1215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001217 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220
Victor Stinnerf1852262012-06-16 16:38:26 +02001221#ifdef Py_DEBUG
1222 if (!check_maxchar
1223 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1224 {
1225 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1226 Py_UCS4 ch;
1227 Py_ssize_t i;
1228 for (i=0; i < how_many; i++) {
1229 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1230 assert(ch <= to_maxchar);
1231 }
1232 }
1233#endif
1234
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001236 if (check_maxchar
1237 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1238 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001239 /* Writing Latin-1 characters into an ASCII string requires to
1240 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001241 Py_UCS4 max_char;
1242 max_char = ucs1lib_find_max_char(from_data,
1243 (Py_UCS1*)from_data + how_many);
1244 if (max_char >= 128)
1245 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001246 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001247 Py_MEMCPY((char*)to_data + to_kind * to_start,
1248 (char*)from_data + from_kind * from_start,
1249 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001251 else if (from_kind == PyUnicode_1BYTE_KIND
1252 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001253 {
1254 _PyUnicode_CONVERT_BYTES(
1255 Py_UCS1, Py_UCS2,
1256 PyUnicode_1BYTE_DATA(from) + from_start,
1257 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1258 PyUnicode_2BYTE_DATA(to) + to_start
1259 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001260 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001261 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001262 && to_kind == PyUnicode_4BYTE_KIND)
1263 {
1264 _PyUnicode_CONVERT_BYTES(
1265 Py_UCS1, Py_UCS4,
1266 PyUnicode_1BYTE_DATA(from) + from_start,
1267 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1268 PyUnicode_4BYTE_DATA(to) + to_start
1269 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001270 }
1271 else if (from_kind == PyUnicode_2BYTE_KIND
1272 && to_kind == PyUnicode_4BYTE_KIND)
1273 {
1274 _PyUnicode_CONVERT_BYTES(
1275 Py_UCS2, Py_UCS4,
1276 PyUnicode_2BYTE_DATA(from) + from_start,
1277 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1278 PyUnicode_4BYTE_DATA(to) + to_start
1279 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001280 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001282 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1283
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001284 if (!check_maxchar) {
1285 if (from_kind == PyUnicode_2BYTE_KIND
1286 && to_kind == PyUnicode_1BYTE_KIND)
1287 {
1288 _PyUnicode_CONVERT_BYTES(
1289 Py_UCS2, Py_UCS1,
1290 PyUnicode_2BYTE_DATA(from) + from_start,
1291 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1292 PyUnicode_1BYTE_DATA(to) + to_start
1293 );
1294 }
1295 else if (from_kind == PyUnicode_4BYTE_KIND
1296 && to_kind == PyUnicode_1BYTE_KIND)
1297 {
1298 _PyUnicode_CONVERT_BYTES(
1299 Py_UCS4, Py_UCS1,
1300 PyUnicode_4BYTE_DATA(from) + from_start,
1301 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1302 PyUnicode_1BYTE_DATA(to) + to_start
1303 );
1304 }
1305 else if (from_kind == PyUnicode_4BYTE_KIND
1306 && to_kind == PyUnicode_2BYTE_KIND)
1307 {
1308 _PyUnicode_CONVERT_BYTES(
1309 Py_UCS4, Py_UCS2,
1310 PyUnicode_4BYTE_DATA(from) + from_start,
1311 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1312 PyUnicode_2BYTE_DATA(to) + to_start
1313 );
1314 }
1315 else {
1316 assert(0);
1317 return -1;
1318 }
1319 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001320 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001321 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001322 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001323 Py_ssize_t i;
1324
Victor Stinnera0702ab2011-09-29 14:14:38 +02001325 for (i=0; i < how_many; i++) {
1326 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001327 if (ch > to_maxchar)
1328 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001329 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1330 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001331 }
1332 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001333 return 0;
1334}
1335
Victor Stinnerd3f08822012-05-29 12:57:52 +02001336void
1337_PyUnicode_FastCopyCharacters(
1338 PyObject *to, Py_ssize_t to_start,
1339 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001340{
1341 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1342}
1343
1344Py_ssize_t
1345PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1346 PyObject *from, Py_ssize_t from_start,
1347 Py_ssize_t how_many)
1348{
1349 int err;
1350
1351 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1352 PyErr_BadInternalCall();
1353 return -1;
1354 }
1355
Benjamin Petersonbac79492012-01-14 13:34:47 -05001356 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001357 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001358 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001359 return -1;
1360
Victor Stinnerd3f08822012-05-29 12:57:52 +02001361 if (from_start < 0) {
1362 PyErr_SetString(PyExc_IndexError, "string index out of range");
1363 return -1;
1364 }
1365 if (to_start < 0) {
1366 PyErr_SetString(PyExc_IndexError, "string index out of range");
1367 return -1;
1368 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001369 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1370 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1371 PyErr_Format(PyExc_SystemError,
1372 "Cannot write %zi characters at %zi "
1373 "in a string of %zi characters",
1374 how_many, to_start, PyUnicode_GET_LENGTH(to));
1375 return -1;
1376 }
1377
1378 if (how_many == 0)
1379 return 0;
1380
Victor Stinner488fa492011-12-12 00:01:39 +01001381 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001382 return -1;
1383
1384 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1385 if (err) {
1386 PyErr_Format(PyExc_SystemError,
1387 "Cannot copy %s characters "
1388 "into a string of %s characters",
1389 unicode_kind_name(from),
1390 unicode_kind_name(to));
1391 return -1;
1392 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001393 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394}
1395
Victor Stinner17222162011-09-28 22:15:37 +02001396/* Find the maximum code point and count the number of surrogate pairs so a
1397 correct string length can be computed before converting a string to UCS4.
1398 This function counts single surrogates as a character and not as a pair.
1399
1400 Return 0 on success, or -1 on error. */
1401static int
1402find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1403 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
1405 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001406 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinnerc53be962011-10-02 21:33:54 +02001408 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 *num_surrogates = 0;
1410 *maxchar = 0;
1411
1412 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001414 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1415 && (iter+1) < end
1416 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1417 {
1418 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1419 ++(*num_surrogates);
1420 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 }
1422 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001424 {
1425 ch = *iter;
1426 iter++;
1427 }
1428 if (ch > *maxchar) {
1429 *maxchar = ch;
1430 if (*maxchar > MAX_UNICODE) {
1431 PyErr_Format(PyExc_ValueError,
1432 "character U+%x is not in range [U+0000; U+10ffff]",
1433 ch);
1434 return -1;
1435 }
1436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 return 0;
1439}
1440
/* Turn a string that only has a wchar_t buffer (kind PyUnicode_WCHAR_KIND,
   no data pointer yet) into a "ready" string with a canonical 1-, 2- or
   4-byte representation chosen from the largest code point in the buffer.

   Return 0 on success.  Return -1 with an exception set if
   find_maxchar_surrogates() fails or if a buffer allocation fails
   (PyErr_NoMemory()). */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One pass over the wchar_t buffer decides which representation to use. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte representation: allocate length + 1 bytes (room for the
           terminating null), narrow every wchar_t into it, then release the
           wchar_t buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the UTF-8 pointer is aimed at the same buffer as
               the character data. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wchar_t buffer is freed only after every read of its length
           above. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Allocate a separate 2-byte buffer (length + 1 units), narrow into
           it, then release the wchar_t buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses to one character, so the final
           length is the wchar_t length minus the number of pairs. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: reuse the wchar_t buffer as the
           character data instead of copying it. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Null-terminate the 4-byte data in both configurations. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1565
/* Deallocator for unicode objects: removes a mortal interned string from the
   `interned` dictionary, releases the separately allocated buffers the
   object owns, then frees the object through its type's tp_free slot.
   An immortal interned string or an unknown interned state reaching this
   function is treated as a fatal error. */
static void
unicode_dealloc(register PyObject *unicode)
{
    switch (PyUnicode_CHECK_INTERNED(unicode)) {
    case SSTATE_NOT_INTERNED:
        break;

    case SSTATE_INTERNED_MORTAL:
        /* revive dead object temporarily for DelItem */
        /* NOTE(review): 3 presumably accounts for the references the
           dictionary holds on the string (as key and as value) plus one
           more, so the count does not reach zero again during the deletion
           -- confirm against the interning code. */
        Py_REFCNT(unicode) = 3;
        if (PyDict_DelItem(interned, unicode) != 0)
            Py_FatalError(
                "deletion of interned string failed");
        break;

    case SSTATE_INTERNED_IMMORTAL:
        /* No break here: this relies on Py_FatalError() not returning
           (otherwise control would fall through to the default case). */
        Py_FatalError("Immortal interned string died.");

    default:
        Py_FatalError("Inconsistent interned string state.");
    }

    /* Each buffer is released only if the corresponding predicate says this
       object owns it. */
    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
        PyObject_DEL(_PyUnicode_WSTR(unicode));
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
        PyObject_DEL(_PyUnicode_UTF8(unicode));
    /* The data buffer is freed separately only for non-compact objects. */
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));

    Py_TYPE(unicode)->tp_free(unicode);
}
1597
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singleton
   objects -- the empty string, or the cached one-character string stored in
   unicode_latin1[] for its code point -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that already has a canonical
       representation can be a cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1614
Alexander Belopolsky40018472011-02-26 01:02:56 +00001615static int
Victor Stinner488fa492011-12-12 00:01:39 +01001616unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001617{
Victor Stinner488fa492011-12-12 00:01:39 +01001618 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001619 if (Py_REFCNT(unicode) != 1)
1620 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001621 if (_PyUnicode_HASH(unicode) != -1)
1622 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 if (PyUnicode_CHECK_INTERNED(unicode))
1624 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001625 if (!PyUnicode_CheckExact(unicode))
1626 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001627#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001628 /* singleton refcount is greater than 1 */
1629 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001630#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 return 1;
1632}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001633
Victor Stinnerfe226c02011-10-03 03:52:20 +02001634static int
1635unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1636{
1637 PyObject *unicode;
1638 Py_ssize_t old_length;
1639
1640 assert(p_unicode != NULL);
1641 unicode = *p_unicode;
1642
1643 assert(unicode != NULL);
1644 assert(PyUnicode_Check(unicode));
1645 assert(0 <= length);
1646
Victor Stinner910337b2011-10-03 03:20:16 +02001647 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001648 old_length = PyUnicode_WSTR_LENGTH(unicode);
1649 else
1650 old_length = PyUnicode_GET_LENGTH(unicode);
1651 if (old_length == length)
1652 return 0;
1653
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001655 _Py_INCREF_UNICODE_EMPTY();
1656 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001657 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001658 Py_DECREF(*p_unicode);
1659 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001660 return 0;
1661 }
1662
Victor Stinner488fa492011-12-12 00:01:39 +01001663 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001664 PyObject *copy = resize_copy(unicode, length);
1665 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001666 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001667 Py_DECREF(*p_unicode);
1668 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001670 }
1671
Victor Stinnerfe226c02011-10-03 03:52:20 +02001672 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001673 PyObject *new_unicode = resize_compact(unicode, length);
1674 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001675 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001676 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001677 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001678 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001679 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001680}
1681
Alexander Belopolsky40018472011-02-26 01:02:56 +00001682int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001683PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001684{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001685 PyObject *unicode;
1686 if (p_unicode == NULL) {
1687 PyErr_BadInternalCall();
1688 return -1;
1689 }
1690 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001691 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001692 {
1693 PyErr_BadInternalCall();
1694 return -1;
1695 }
1696 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001697}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001698
Victor Stinnerc5166102012-02-22 13:55:02 +01001699/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001700
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001701 WARNING: The function doesn't copy the terminating null character and
1702 doesn't check the maximum character (may write a latin1 character in an
1703 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001704static void
1705unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1706 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001707{
1708 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1709 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001710 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001711
1712 switch (kind) {
1713 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001714 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001715#ifdef Py_DEBUG
1716 if (PyUnicode_IS_ASCII(unicode)) {
1717 Py_UCS4 maxchar = ucs1lib_find_max_char(
1718 (const Py_UCS1*)str,
1719 (const Py_UCS1*)str + len);
1720 assert(maxchar < 128);
1721 }
1722#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001723 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001724 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001725 }
1726 case PyUnicode_2BYTE_KIND: {
1727 Py_UCS2 *start = (Py_UCS2 *)data + index;
1728 Py_UCS2 *ucs2 = start;
1729 assert(index <= PyUnicode_GET_LENGTH(unicode));
1730
Victor Stinner184252a2012-06-16 02:57:41 +02001731 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 *ucs2 = (Py_UCS2)*str;
1733
1734 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001735 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001736 }
1737 default: {
1738 Py_UCS4 *start = (Py_UCS4 *)data + index;
1739 Py_UCS4 *ucs4 = start;
1740 assert(kind == PyUnicode_4BYTE_KIND);
1741 assert(index <= PyUnicode_GET_LENGTH(unicode));
1742
Victor Stinner184252a2012-06-16 02:57:41 +02001743 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001744 *ucs4 = (Py_UCS4)*str;
1745
1746 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001747 }
1748 }
1749}
1750
1751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752static PyObject*
1753get_latin1_char(unsigned char ch)
1754{
Victor Stinnera464fc12011-10-02 20:39:30 +02001755 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode)
1759 return NULL;
1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 unicode_latin1[ch] = unicode;
1763 }
1764 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001765 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Alexander Belopolsky40018472011-02-26 01:02:56 +00001768PyObject *
1769PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001771 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 Py_UCS4 maxchar = 0;
1773 Py_ssize_t num_surrogates;
1774
1775 if (u == NULL)
1776 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001778 /* If the Unicode data is known at construction time, we can apply
1779 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001782 if (size == 0)
1783 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 /* Single character Unicode objects in the Latin-1 range are
1786 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001787 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 return get_latin1_char((unsigned char)*u);
1789
1790 /* If not empty and not single character, copy the Unicode data
1791 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001792 if (find_maxchar_surrogates(u, u + size,
1793 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 return NULL;
1795
Victor Stinner8faf8212011-12-08 22:14:11 +01001796 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 if (!unicode)
1798 return NULL;
1799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 switch (PyUnicode_KIND(unicode)) {
1801 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001802 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1804 break;
1805 case PyUnicode_2BYTE_KIND:
1806#if Py_UNICODE_SIZE == 2
1807 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1808#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001809 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1811#endif
1812 break;
1813 case PyUnicode_4BYTE_KIND:
1814#if SIZEOF_WCHAR_T == 2
1815 /* This is the only case which has to process surrogates, thus
1816 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001817 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818#else
1819 assert(num_surrogates == 0);
1820 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1821#endif
1822 break;
1823 default:
1824 assert(0 && "Impossible state");
1825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001827 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828}
1829
Alexander Belopolsky40018472011-02-26 01:02:56 +00001830PyObject *
1831PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001832{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001833 if (size < 0) {
1834 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001835 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 return NULL;
1837 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001838 if (u != NULL)
1839 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1840 else
1841 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001842}
1843
Alexander Belopolsky40018472011-02-26 01:02:56 +00001844PyObject *
1845PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001846{
1847 size_t size = strlen(u);
1848 if (size > PY_SSIZE_T_MAX) {
1849 PyErr_SetString(PyExc_OverflowError, "input too long");
1850 return NULL;
1851 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001852 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001853}
1854
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855PyObject *
1856_PyUnicode_FromId(_Py_Identifier *id)
1857{
1858 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001859 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1860 strlen(id->string),
1861 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001862 if (!id->object)
1863 return NULL;
1864 PyUnicode_InternInPlace(&id->object);
1865 assert(!id->next);
1866 id->next = static_strings;
1867 static_strings = id;
1868 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001869 return id->object;
1870}
1871
1872void
1873_PyUnicode_ClearStaticStrings()
1874{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001875 _Py_Identifier *tmp, *s = static_strings;
1876 while (s) {
1877 Py_DECREF(s->object);
1878 s->object = NULL;
1879 tmp = s->next;
1880 s->next = NULL;
1881 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001882 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001883 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001884}
1885
Benjamin Peterson0df54292012-03-26 14:50:32 -04001886/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001887
Victor Stinnerd3f08822012-05-29 12:57:52 +02001888PyObject*
1889_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001890{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001891 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001892 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001893 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001894#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001895 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001896#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001897 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001898 }
Victor Stinner785938e2011-12-11 20:09:03 +01001899 unicode = PyUnicode_New(size, 127);
1900 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001901 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001902 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1903 assert(_PyUnicode_CheckConsistency(unicode, 1));
1904 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001905}
1906
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907static Py_UCS4
1908kind_maxchar_limit(unsigned int kind)
1909{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001910 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001911 case PyUnicode_1BYTE_KIND:
1912 return 0x80;
1913 case PyUnicode_2BYTE_KIND:
1914 return 0x100;
1915 case PyUnicode_4BYTE_KIND:
1916 return 0x10000;
1917 default:
1918 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001919 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001920 }
1921}
1922
Victor Stinnere6abb482012-05-02 01:15:40 +02001923Py_LOCAL_INLINE(Py_UCS4)
1924align_maxchar(Py_UCS4 maxchar)
1925{
1926 if (maxchar <= 127)
1927 return 127;
1928 else if (maxchar <= 255)
1929 return 255;
1930 else if (maxchar <= 65535)
1931 return 65535;
1932 else
1933 return MAX_UNICODE;
1934}
1935
Victor Stinner702c7342011-10-05 13:50:52 +02001936static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001937_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001940 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001941
Serhiy Storchaka678db842013-01-26 12:16:36 +02001942 if (size == 0)
1943 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001944 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001945 if (size == 1)
1946 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001947
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001948 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001949 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 if (!res)
1951 return NULL;
1952 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001953 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001955}
1956
Victor Stinnere57b1c02011-09-28 22:20:48 +02001957static PyObject*
1958_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959{
1960 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001961 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001962
Serhiy Storchaka678db842013-01-26 12:16:36 +02001963 if (size == 0)
1964 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001965 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001966 if (size == 1) {
1967 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001968 int kind;
1969 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001970 if (ch < 256)
1971 return get_latin1_char((unsigned char)ch);
1972
1973 res = PyUnicode_New(1, ch);
1974 if (res == NULL)
1975 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001976 kind = PyUnicode_KIND(res);
1977 data = PyUnicode_DATA(res);
1978 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001979 assert(_PyUnicode_CheckConsistency(res, 1));
1980 return res;
1981 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001982
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001983 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001984 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 if (!res)
1986 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001987 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001989 else {
1990 _PyUnicode_CONVERT_BYTES(
1991 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1992 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001993 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 return res;
1995}
1996
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997static PyObject*
1998_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999{
2000 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002
Serhiy Storchaka678db842013-01-26 12:16:36 +02002003 if (size == 0)
2004 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002005 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002006 if (size == 1) {
2007 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002008 int kind;
2009 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002010 if (ch < 256)
2011 return get_latin1_char((unsigned char)ch);
2012
2013 res = PyUnicode_New(1, ch);
2014 if (res == NULL)
2015 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002016 kind = PyUnicode_KIND(res);
2017 data = PyUnicode_DATA(res);
2018 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002019 assert(_PyUnicode_CheckConsistency(res, 1));
2020 return res;
2021 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002022
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002023 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002024 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 if (!res)
2026 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002027 if (max_char < 256)
2028 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2029 PyUnicode_1BYTE_DATA(res));
2030 else if (max_char < 0x10000)
2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2032 PyUnicode_2BYTE_DATA(res));
2033 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002035 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 return res;
2037}
2038
2039PyObject*
2040PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2041{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002042 if (size < 0) {
2043 PyErr_SetString(PyExc_ValueError, "size must be positive");
2044 return NULL;
2045 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002046 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002048 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002050 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002052 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002053 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002054 PyErr_SetString(PyExc_SystemError, "invalid kind");
2055 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057}
2058
Victor Stinnerece58de2012-04-23 23:36:38 +02002059Py_UCS4
2060_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2061{
2062 enum PyUnicode_Kind kind;
2063 void *startptr, *endptr;
2064
2065 assert(PyUnicode_IS_READY(unicode));
2066 assert(0 <= start);
2067 assert(end <= PyUnicode_GET_LENGTH(unicode));
2068 assert(start <= end);
2069
2070 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2071 return PyUnicode_MAX_CHAR_VALUE(unicode);
2072
2073 if (start == end)
2074 return 127;
2075
Victor Stinner94d558b2012-04-27 22:26:58 +02002076 if (PyUnicode_IS_ASCII(unicode))
2077 return 127;
2078
Victor Stinnerece58de2012-04-23 23:36:38 +02002079 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002080 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002081 endptr = (char *)startptr + end * kind;
2082 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002083 switch(kind) {
2084 case PyUnicode_1BYTE_KIND:
2085 return ucs1lib_find_max_char(startptr, endptr);
2086 case PyUnicode_2BYTE_KIND:
2087 return ucs2lib_find_max_char(startptr, endptr);
2088 case PyUnicode_4BYTE_KIND:
2089 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002090 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002091 assert(0);
2092 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002093 }
2094}
2095
Victor Stinner25a4b292011-10-06 12:31:55 +02002096/* Ensure that a string uses the most efficient storage, if it is not the
2097 case: create a new string with of the right kind. Write NULL into *p_unicode
2098 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002099static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002100unicode_adjust_maxchar(PyObject **p_unicode)
2101{
2102 PyObject *unicode, *copy;
2103 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002104 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002105 unsigned int kind;
2106
2107 assert(p_unicode != NULL);
2108 unicode = *p_unicode;
2109 assert(PyUnicode_IS_READY(unicode));
2110 if (PyUnicode_IS_ASCII(unicode))
2111 return;
2112
2113 len = PyUnicode_GET_LENGTH(unicode);
2114 kind = PyUnicode_KIND(unicode);
2115 if (kind == PyUnicode_1BYTE_KIND) {
2116 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002117 max_char = ucs1lib_find_max_char(u, u + len);
2118 if (max_char >= 128)
2119 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002120 }
2121 else if (kind == PyUnicode_2BYTE_KIND) {
2122 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002123 max_char = ucs2lib_find_max_char(u, u + len);
2124 if (max_char >= 256)
2125 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002126 }
2127 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002128 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002130 max_char = ucs4lib_find_max_char(u, u + len);
2131 if (max_char >= 0x10000)
2132 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002135 if (copy != NULL)
2136 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 Py_DECREF(unicode);
2138 *p_unicode = copy;
2139}
2140
Victor Stinner034f6cf2011-09-30 02:26:44 +02002141PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002142_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002143{
Victor Stinner87af4f22011-11-21 23:03:47 +01002144 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146
Victor Stinner034f6cf2011-09-30 02:26:44 +02002147 if (!PyUnicode_Check(unicode)) {
2148 PyErr_BadInternalCall();
2149 return NULL;
2150 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002151 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002152 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002153
Victor Stinner87af4f22011-11-21 23:03:47 +01002154 length = PyUnicode_GET_LENGTH(unicode);
2155 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156 if (!copy)
2157 return NULL;
2158 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2159
Victor Stinner87af4f22011-11-21 23:03:47 +01002160 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2161 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002162 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002163 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002164}
2165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167/* Widen Unicode objects to larger buffers. Don't write terminating null
2168 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169
2170void*
2171_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2172{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002173 Py_ssize_t len;
2174 void *result;
2175 unsigned int skind;
2176
Benjamin Petersonbac79492012-01-14 13:34:47 -05002177 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002178 return NULL;
2179
2180 len = PyUnicode_GET_LENGTH(s);
2181 skind = PyUnicode_KIND(s);
2182 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002183 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 return NULL;
2185 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002186 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002187 case PyUnicode_2BYTE_KIND:
2188 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2189 if (!result)
2190 return PyErr_NoMemory();
2191 assert(skind == PyUnicode_1BYTE_KIND);
2192 _PyUnicode_CONVERT_BYTES(
2193 Py_UCS1, Py_UCS2,
2194 PyUnicode_1BYTE_DATA(s),
2195 PyUnicode_1BYTE_DATA(s) + len,
2196 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002198 case PyUnicode_4BYTE_KIND:
2199 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2200 if (!result)
2201 return PyErr_NoMemory();
2202 if (skind == PyUnicode_2BYTE_KIND) {
2203 _PyUnicode_CONVERT_BYTES(
2204 Py_UCS2, Py_UCS4,
2205 PyUnicode_2BYTE_DATA(s),
2206 PyUnicode_2BYTE_DATA(s) + len,
2207 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002209 else {
2210 assert(skind == PyUnicode_1BYTE_KIND);
2211 _PyUnicode_CONVERT_BYTES(
2212 Py_UCS1, Py_UCS4,
2213 PyUnicode_1BYTE_DATA(s),
2214 PyUnicode_1BYTE_DATA(s) + len,
2215 result);
2216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002218 default:
2219 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 }
Victor Stinner01698042011-10-04 00:04:26 +02002221 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 return NULL;
2223}
2224
2225static Py_UCS4*
2226as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2227 int copy_null)
2228{
2229 int kind;
2230 void *data;
2231 Py_ssize_t len, targetlen;
2232 if (PyUnicode_READY(string) == -1)
2233 return NULL;
2234 kind = PyUnicode_KIND(string);
2235 data = PyUnicode_DATA(string);
2236 len = PyUnicode_GET_LENGTH(string);
2237 targetlen = len;
2238 if (copy_null)
2239 targetlen++;
2240 if (!target) {
2241 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2242 PyErr_NoMemory();
2243 return NULL;
2244 }
2245 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2246 if (!target) {
2247 PyErr_NoMemory();
2248 return NULL;
2249 }
2250 }
2251 else {
2252 if (targetsize < targetlen) {
2253 PyErr_Format(PyExc_SystemError,
2254 "string is longer than the buffer");
2255 if (copy_null && 0 < targetsize)
2256 target[0] = 0;
2257 return NULL;
2258 }
2259 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 if (kind == PyUnicode_1BYTE_KIND) {
2261 Py_UCS1 *start = (Py_UCS1 *) data;
2262 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002264 else if (kind == PyUnicode_2BYTE_KIND) {
2265 Py_UCS2 *start = (Py_UCS2 *) data;
2266 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2267 }
2268 else {
2269 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 if (copy_null)
2273 target[len] = 0;
2274 return target;
2275}
2276
2277Py_UCS4*
2278PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2279 int copy_null)
2280{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002281 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 PyErr_BadInternalCall();
2283 return NULL;
2284 }
2285 return as_ucs4(string, target, targetsize, copy_null);
2286}
2287
2288Py_UCS4*
2289PyUnicode_AsUCS4Copy(PyObject *string)
2290{
2291 return as_ucs4(string, NULL, 0, 1);
2292}
2293
2294#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002295
Alexander Belopolsky40018472011-02-26 01:02:56 +00002296PyObject *
2297PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002301 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 PyErr_BadInternalCall();
2303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 }
2305
Martin v. Löwis790465f2008-04-05 20:41:37 +00002306 if (size == -1) {
2307 size = wcslen(w);
2308 }
2309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311}
2312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002314
/* Build a printf-style format "%<modifier><c>" into `fmt`.
   The length modifier is "l" for longflag, PY_FORMAT_LONG_LONG for
   longlongflag, PY_FORMAT_SIZE_T for size_tflag, and empty otherwise; the
   flags are tested in that order and only the first one set is used.
   `fmt` must be large enough for the result including the terminating
   null character. */
static void
makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
        char c)
{
    const char *modifier = "";

    if (longflag) {
        modifier = "l";
    }
    else if (longlongflag) {
        /* longlongflag should only ever be nonzero on machines with
           HAVE_LONG_LONG defined */
#ifdef HAVE_LONG_LONG
        modifier = PY_FORMAT_LONG_LONG;
#else
        /* we shouldn't ever get here */
        assert(0);
        modifier = "l";
#endif
    }
    else if (size_tflag) {
        modifier = PY_FORMAT_SIZE_T;
    }

    *fmt++ = '%';
    while (*modifier)
        *fmt++ = *modifier++;
    *fmt++ = c;
    *fmt = '\0';
}
2343
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).

   Used as the size of the stack buffers that sprintf() fills for the
   integer and %p conversions in unicode_fromformat_arg().
   NOTE(review): the %p path may also prepend "0x" to the sprintf() output;
   confirm the bound still holds on platforms with unusual %p output. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002348
2349static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002350unicode_fromformat_arg(_PyUnicodeWriter *writer,
2351 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002352{
Victor Stinnere215d962012-10-06 23:03:36 +02002353 const char *p;
2354 Py_ssize_t len;
2355 int zeropad;
2356 int width;
2357 int precision;
2358 int longflag;
2359 int longlongflag;
2360 int size_tflag;
2361 int fill;
2362
2363 p = f;
2364 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002365 zeropad = 0;
2366 if (*f == '0') {
2367 zeropad = 1;
2368 f++;
2369 }
Victor Stinner96865452011-03-01 23:44:09 +00002370
2371 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002372 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002373 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002374 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2375 PyErr_SetString(PyExc_ValueError,
2376 "width too big");
2377 return NULL;
2378 }
Victor Stinnere215d962012-10-06 23:03:36 +02002379 width = (width*10) + (*f - '0');
2380 f++;
2381 }
Victor Stinner96865452011-03-01 23:44:09 +00002382 precision = 0;
2383 if (*f == '.') {
2384 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002385 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002386 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2387 PyErr_SetString(PyExc_ValueError,
2388 "precision too big");
2389 return NULL;
2390 }
Victor Stinnere215d962012-10-06 23:03:36 +02002391 precision = (precision*10) + (*f - '0');
2392 f++;
2393 }
Victor Stinner96865452011-03-01 23:44:09 +00002394 if (*f == '%') {
2395 /* "%.3%s" => f points to "3" */
2396 f--;
2397 }
2398 }
2399 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002400 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002401 f--;
2402 }
Victor Stinner96865452011-03-01 23:44:09 +00002403
2404 /* Handle %ld, %lu, %lld and %llu. */
2405 longflag = 0;
2406 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002407 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002408 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002409 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002410 longflag = 1;
2411 ++f;
2412 }
2413#ifdef HAVE_LONG_LONG
2414 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002415 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002416 longlongflag = 1;
2417 f += 2;
2418 }
2419#endif
2420 }
2421 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002422 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002423 size_tflag = 1;
2424 ++f;
2425 }
Victor Stinnere215d962012-10-06 23:03:36 +02002426
2427 if (f[1] == '\0')
2428 writer->overallocate = 0;
2429
2430 switch (*f) {
2431 case 'c':
2432 {
2433 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002434 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2435 PyErr_SetString(PyExc_ValueError,
2436 "character argument not in range(0x110000)");
2437 return NULL;
2438 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002439 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002440 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002441 break;
2442 }
2443
2444 case 'i':
2445 case 'd':
2446 case 'u':
2447 case 'x':
2448 {
2449 /* used by sprintf */
2450 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002451 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002452
2453 if (*f == 'u') {
2454 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2455
2456 if (longflag)
2457 len = sprintf(buffer, fmt,
2458 va_arg(*vargs, unsigned long));
2459#ifdef HAVE_LONG_LONG
2460 else if (longlongflag)
2461 len = sprintf(buffer, fmt,
2462 va_arg(*vargs, unsigned PY_LONG_LONG));
2463#endif
2464 else if (size_tflag)
2465 len = sprintf(buffer, fmt,
2466 va_arg(*vargs, size_t));
2467 else
2468 len = sprintf(buffer, fmt,
2469 va_arg(*vargs, unsigned int));
2470 }
2471 else if (*f == 'x') {
2472 makefmt(fmt, 0, 0, 0, 'x');
2473 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2474 }
2475 else {
2476 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2477
2478 if (longflag)
2479 len = sprintf(buffer, fmt,
2480 va_arg(*vargs, long));
2481#ifdef HAVE_LONG_LONG
2482 else if (longlongflag)
2483 len = sprintf(buffer, fmt,
2484 va_arg(*vargs, PY_LONG_LONG));
2485#endif
2486 else if (size_tflag)
2487 len = sprintf(buffer, fmt,
2488 va_arg(*vargs, Py_ssize_t));
2489 else
2490 len = sprintf(buffer, fmt,
2491 va_arg(*vargs, int));
2492 }
2493 assert(len >= 0);
2494
Victor Stinnere215d962012-10-06 23:03:36 +02002495 if (precision < len)
2496 precision = len;
2497 if (width > precision) {
2498 Py_UCS4 fillchar;
2499 fill = width - precision;
2500 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002501 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2502 return NULL;
2503 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2504 return NULL;
2505 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002506 }
Victor Stinner15a11362012-10-06 23:48:20 +02002507 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002508 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002509 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2510 return NULL;
2511 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2512 return NULL;
2513 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002514 }
Victor Stinner15a11362012-10-06 23:48:20 +02002515 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002516 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002517 break;
2518 }
2519
2520 case 'p':
2521 {
2522 char number[MAX_LONG_LONG_CHARS];
2523
2524 len = sprintf(number, "%p", va_arg(*vargs, void*));
2525 assert(len >= 0);
2526
2527 /* %p is ill-defined: ensure leading 0x. */
2528 if (number[1] == 'X')
2529 number[1] = 'x';
2530 else if (number[1] != 'x') {
2531 memmove(number + 2, number,
2532 strlen(number) + 1);
2533 number[0] = '0';
2534 number[1] = 'x';
2535 len += 2;
2536 }
2537
2538 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2539 return NULL;
2540 break;
2541 }
2542
2543 case 's':
2544 {
2545 /* UTF-8 */
2546 const char *s = va_arg(*vargs, const char*);
2547 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2548 if (!str)
2549 return NULL;
2550 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2551 Py_DECREF(str);
2552 return NULL;
2553 }
2554 Py_DECREF(str);
2555 break;
2556 }
2557
2558 case 'U':
2559 {
2560 PyObject *obj = va_arg(*vargs, PyObject *);
2561 assert(obj && _PyUnicode_CHECK(obj));
2562
2563 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2564 return NULL;
2565 break;
2566 }
2567
2568 case 'V':
2569 {
2570 PyObject *obj = va_arg(*vargs, PyObject *);
2571 const char *str = va_arg(*vargs, const char *);
2572 PyObject *str_obj;
2573 assert(obj || str);
2574 if (obj) {
2575 assert(_PyUnicode_CHECK(obj));
2576 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2577 return NULL;
2578 }
2579 else {
2580 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2581 if (!str_obj)
2582 return NULL;
2583 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2584 Py_DECREF(str_obj);
2585 return NULL;
2586 }
2587 Py_DECREF(str_obj);
2588 }
2589 break;
2590 }
2591
2592 case 'S':
2593 {
2594 PyObject *obj = va_arg(*vargs, PyObject *);
2595 PyObject *str;
2596 assert(obj);
2597 str = PyObject_Str(obj);
2598 if (!str)
2599 return NULL;
2600 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2601 Py_DECREF(str);
2602 return NULL;
2603 }
2604 Py_DECREF(str);
2605 break;
2606 }
2607
2608 case 'R':
2609 {
2610 PyObject *obj = va_arg(*vargs, PyObject *);
2611 PyObject *repr;
2612 assert(obj);
2613 repr = PyObject_Repr(obj);
2614 if (!repr)
2615 return NULL;
2616 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2617 Py_DECREF(repr);
2618 return NULL;
2619 }
2620 Py_DECREF(repr);
2621 break;
2622 }
2623
2624 case 'A':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 PyObject *ascii;
2628 assert(obj);
2629 ascii = PyObject_ASCII(obj);
2630 if (!ascii)
2631 return NULL;
2632 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2633 Py_DECREF(ascii);
2634 return NULL;
2635 }
2636 Py_DECREF(ascii);
2637 break;
2638 }
2639
2640 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002641 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002642 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002643 break;
2644
2645 default:
2646 /* if we stumble upon an unknown formatting code, copy the rest
2647 of the format string to the output string. (we cannot just
2648 skip the code, since there's no way to know what's in the
2649 argument list) */
2650 len = strlen(p);
2651 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2652 return NULL;
2653 f = p+len;
2654 return f;
2655 }
2656
2657 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002658 return f;
2659}
2660
Walter Dörwaldd2034312007-05-18 16:29:38 +00002661PyObject *
2662PyUnicode_FromFormatV(const char *format, va_list vargs)
2663{
Victor Stinnere215d962012-10-06 23:03:36 +02002664 va_list vargs2;
2665 const char *f;
2666 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667
Victor Stinner8f674cc2013-04-17 23:02:17 +02002668 _PyUnicodeWriter_Init(&writer);
2669 writer.min_length = strlen(format) + 100;
2670 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002671
2672 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2673 Copy it to be able to pass a reference to a subfunction. */
2674 Py_VA_COPY(vargs2, vargs);
2675
2676 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002678 f = unicode_fromformat_arg(&writer, f, &vargs2);
2679 if (f == NULL)
2680 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002683 const char *p;
2684 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002685
Victor Stinnere215d962012-10-06 23:03:36 +02002686 p = f;
2687 do
2688 {
2689 if ((unsigned char)*p > 127) {
2690 PyErr_Format(PyExc_ValueError,
2691 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2692 "string, got a non-ASCII byte: 0x%02x",
2693 (unsigned char)*p);
2694 return NULL;
2695 }
2696 p++;
2697 }
2698 while (*p != '\0' && *p != '%');
2699 len = p - f;
2700
2701 if (*p == '\0')
2702 writer.overallocate = 0;
2703 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2704 goto fail;
2705 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2706 writer.pos += len;
2707
2708 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 }
Victor Stinnere215d962012-10-06 23:03:36 +02002711 return _PyUnicodeWriter_Finish(&writer);
2712
2713 fail:
2714 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002715 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716}
2717
Walter Dörwaldd2034312007-05-18 16:29:38 +00002718PyObject *
2719PyUnicode_FromFormat(const char *format, ...)
2720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 PyObject* ret;
2722 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002723
2724#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002726#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002728#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002729 ret = PyUnicode_FromFormatV(format, vargs);
2730 va_end(vargs);
2731 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002732}
2733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734#ifdef HAVE_WCHAR_H
2735
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2737 convert a Unicode object to a wide character string.
2738
Victor Stinnerd88d9832011-09-06 02:00:05 +02002739 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002740 character) required to convert the unicode object. Ignore size argument.
2741
Victor Stinnerd88d9832011-09-06 02:00:05 +02002742 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002744 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002745static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002746unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002747 wchar_t *w,
2748 Py_ssize_t size)
2749{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002750 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 const wchar_t *wstr;
2752
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002753 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002754 if (wstr == NULL)
2755 return -1;
2756
Victor Stinner5593d8a2010-10-02 11:11:27 +00002757 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002758 if (size > res)
2759 size = res + 1;
2760 else
2761 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002762 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002763 return res;
2764 }
2765 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002767}
2768
2769Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002770PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002771 wchar_t *w,
2772 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773{
2774 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002775 PyErr_BadInternalCall();
2776 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002778 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779}
2780
Victor Stinner137c34c2010-09-29 10:25:54 +00002781wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002782PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002783 Py_ssize_t *size)
2784{
2785 wchar_t* buffer;
2786 Py_ssize_t buflen;
2787
2788 if (unicode == NULL) {
2789 PyErr_BadInternalCall();
2790 return NULL;
2791 }
2792
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002793 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002794 if (buflen == -1)
2795 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002796 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002797 PyErr_NoMemory();
2798 return NULL;
2799 }
2800
Victor Stinner137c34c2010-09-29 10:25:54 +00002801 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2802 if (buffer == NULL) {
2803 PyErr_NoMemory();
2804 return NULL;
2805 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002806 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002807 if (buflen == -1) {
2808 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002809 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002810 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002811 if (size != NULL)
2812 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002813 return buffer;
2814}
2815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817
Alexander Belopolsky40018472011-02-26 01:02:56 +00002818PyObject *
2819PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002822 void *data;
2823 int kind;
2824
Victor Stinner8faf8212011-12-08 22:14:11 +01002825 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 PyErr_SetString(PyExc_ValueError,
2827 "chr() arg not in range(0x110000)");
2828 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002829 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002830
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002831 if ((Py_UCS4)ordinal < 256)
2832 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002834 v = PyUnicode_New(1, ordinal);
2835 if (v == NULL)
2836 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002837 kind = PyUnicode_KIND(v);
2838 data = PyUnicode_DATA(v);
2839 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002840 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002841 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002842}
2843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002847 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002848 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002849 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002850 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002851 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 Py_INCREF(obj);
2853 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002854 }
2855 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002856 /* For a Unicode subtype that's not a Unicode object,
2857 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002858 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002860 PyErr_Format(PyExc_TypeError,
2861 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002862 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002863 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002864}
2865
Alexander Belopolsky40018472011-02-26 01:02:56 +00002866PyObject *
2867PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002868 const char *encoding,
2869 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002870{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002871 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002872 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002873
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 PyErr_BadInternalCall();
2876 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002878
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 /* Decoding bytes objects is the most common case and should be fast */
2880 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002881 if (PyBytes_GET_SIZE(obj) == 0)
2882 _Py_RETURN_UNICODE_EMPTY();
2883 v = PyUnicode_Decode(
2884 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2885 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 return v;
2887 }
2888
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002889 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002890 PyErr_SetString(PyExc_TypeError,
2891 "decoding str is not supported");
2892 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002893 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002894
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2896 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2897 PyErr_Format(PyExc_TypeError,
2898 "coercing to str: need bytes, bytearray "
2899 "or buffer-like object, %.80s found",
2900 Py_TYPE(obj)->tp_name);
2901 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002902 }
Tim Petersced69f82003-09-16 20:30:58 +00002903
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002904 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002905 PyBuffer_Release(&buffer);
2906 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002908
Serhiy Storchaka05997252013-01-26 12:14:02 +02002909 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002910 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002911 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912}
2913
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   encoding:   NUL-terminated encoding name, or NULL.
   lower:      output buffer receiving the NUL-terminated normalized name.
   lower_len:  size of the output buffer in bytes, terminator included.

   Return 1 on success.  Return 0 if the result (including its terminating
   NUL) does not fit into lower_len bytes; the contents of lower are then
   unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without this guard, lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating NUL that strcpy() writes. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2950
Alexander Belopolsky40018472011-02-26 01:02:56 +00002951PyObject *
2952PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002953 Py_ssize_t size,
2954 const char *encoding,
2955 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002956{
2957 PyObject *buffer = NULL, *unicode;
2958 Py_buffer info;
2959 char lower[11]; /* Enough for any encoding shortcut */
2960
Fred Drakee4315f52000-05-09 19:53:39 +00002961 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002962 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002963 if ((strcmp(lower, "utf-8") == 0) ||
2964 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002965 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002966 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002967 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002968 (strcmp(lower, "iso-8859-1") == 0))
2969 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002970#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002971 else if (strcmp(lower, "mbcs") == 0)
2972 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002973#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002974 else if (strcmp(lower, "ascii") == 0)
2975 return PyUnicode_DecodeASCII(s, size, errors);
2976 else if (strcmp(lower, "utf-16") == 0)
2977 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2978 else if (strcmp(lower, "utf-32") == 0)
2979 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981
2982 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002983 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002984 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002985 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002986 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 if (buffer == NULL)
2988 goto onError;
2989 unicode = PyCodec_Decode(buffer, encoding, errors);
2990 if (unicode == NULL)
2991 goto onError;
2992 if (!PyUnicode_Check(unicode)) {
2993 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002994 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002995 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 Py_DECREF(unicode);
2997 goto onError;
2998 }
2999 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003000 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003001
Benjamin Peterson29060642009-01-31 22:14:21 +00003002 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 Py_XDECREF(buffer);
3004 return NULL;
3005}
3006
Alexander Belopolsky40018472011-02-26 01:02:56 +00003007PyObject *
3008PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003009 const char *encoding,
3010 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003011{
3012 PyObject *v;
3013
3014 if (!PyUnicode_Check(unicode)) {
3015 PyErr_BadArgument();
3016 goto onError;
3017 }
3018
3019 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003020 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003021
3022 /* Decode via the codec registry */
3023 v = PyCodec_Decode(unicode, encoding, errors);
3024 if (v == NULL)
3025 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003026 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003027
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003029 return NULL;
3030}
3031
Alexander Belopolsky40018472011-02-26 01:02:56 +00003032PyObject *
3033PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003034 const char *encoding,
3035 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003036{
3037 PyObject *v;
3038
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_BadArgument();
3041 goto onError;
3042 }
3043
3044 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003046
3047 /* Decode via the codec registry */
3048 v = PyCodec_Decode(unicode, encoding, errors);
3049 if (v == NULL)
3050 goto onError;
3051 if (!PyUnicode_Check(v)) {
3052 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003053 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003054 Py_TYPE(v)->tp_name);
3055 Py_DECREF(v);
3056 goto onError;
3057 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003058 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003059
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003061 return NULL;
3062}
3063
Alexander Belopolsky40018472011-02-26 01:02:56 +00003064PyObject *
3065PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003066 Py_ssize_t size,
3067 const char *encoding,
3068 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069{
3070 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003071
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 unicode = PyUnicode_FromUnicode(s, size);
3073 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3076 Py_DECREF(unicode);
3077 return v;
3078}
3079
Alexander Belopolsky40018472011-02-26 01:02:56 +00003080PyObject *
3081PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003082 const char *encoding,
3083 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003084{
3085 PyObject *v;
3086
3087 if (!PyUnicode_Check(unicode)) {
3088 PyErr_BadArgument();
3089 goto onError;
3090 }
3091
3092 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003093 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003094
3095 /* Encode via the codec registry */
3096 v = PyCodec_Encode(unicode, encoding, errors);
3097 if (v == NULL)
3098 goto onError;
3099 return v;
3100
Benjamin Peterson29060642009-01-31 22:14:21 +00003101 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003102 return NULL;
3103}
3104
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide) so that the failing position can be
   identified.

   Returns the index, in wchar_t units from the start of wstr, of the first
   unit of the offending character.  Returns 0 when no such character is
   found, which is indistinguishable from a failure at index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    size_t len;
#if SIZEOF_WCHAR_T == 2
    wchar_t buf[3];     /* surrogate pair + terminator */
#else
    wchar_t buf[2];     /* one character + terminator */
#endif
    char outbuf[MB_LEN_MAX];
    const wchar_t *start, *previous;

#if SIZEOF_WCHAR_T == 2
    buf[2] = 0;
#else
    buf[1] = 0;
#endif
    start = wstr;
    while (*wstr != L'\0')
    {
        /* Remember where this character began before advancing. */
        previous = wstr;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
        {
            buf[0] = wstr[0];
            buf[1] = wstr[1];
            wstr += 2;
        }
        else {
            buf[0] = *wstr;
            buf[1] = 0;
            wstr++;
        }
#else
        buf[0] = *wstr;
        wstr++;
#endif
        len = wcstombs(outbuf, buf, sizeof(outbuf));
        if (len == (size_t)-1)
            return previous - start;
    }

    /* failed to find the unencodable character */
    return 0;
}
3151
Victor Stinner1b579672011-12-17 05:47:23 +01003152static int
3153locale_error_handler(const char *errors, int *surrogateescape)
3154{
3155 if (errors == NULL) {
3156 *surrogateescape = 0;
3157 return 0;
3158 }
3159
3160 if (strcmp(errors, "strict") == 0) {
3161 *surrogateescape = 0;
3162 return 0;
3163 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003164 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003165 *surrogateescape = 1;
3166 return 0;
3167 }
3168 PyErr_Format(PyExc_ValueError,
3169 "only 'strict' and 'surrogateescape' error handlers "
3170 "are supported, not '%s'",
3171 errors);
3172 return -1;
3173}
3174
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003175PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003176PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003177{
3178 Py_ssize_t wlen, wlen2;
3179 wchar_t *wstr;
3180 PyObject *bytes = NULL;
3181 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003182 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003183 PyObject *exc;
3184 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003185 int surrogateescape;
3186
3187 if (locale_error_handler(errors, &surrogateescape) < 0)
3188 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003189
3190 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3191 if (wstr == NULL)
3192 return NULL;
3193
3194 wlen2 = wcslen(wstr);
3195 if (wlen2 != wlen) {
3196 PyMem_Free(wstr);
3197 PyErr_SetString(PyExc_TypeError, "embedded null character");
3198 return NULL;
3199 }
3200
3201 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003202 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003203 char *str;
3204
3205 str = _Py_wchar2char(wstr, &error_pos);
3206 if (str == NULL) {
3207 if (error_pos == (size_t)-1) {
3208 PyErr_NoMemory();
3209 PyMem_Free(wstr);
3210 return NULL;
3211 }
3212 else {
3213 goto encode_error;
3214 }
3215 }
3216 PyMem_Free(wstr);
3217
3218 bytes = PyBytes_FromString(str);
3219 PyMem_Free(str);
3220 }
3221 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003222 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003223 size_t len, len2;
3224
3225 len = wcstombs(NULL, wstr, 0);
3226 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003227 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003228 goto encode_error;
3229 }
3230
3231 bytes = PyBytes_FromStringAndSize(NULL, len);
3232 if (bytes == NULL) {
3233 PyMem_Free(wstr);
3234 return NULL;
3235 }
3236
3237 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3238 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003239 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003240 goto encode_error;
3241 }
3242 PyMem_Free(wstr);
3243 }
3244 return bytes;
3245
3246encode_error:
3247 errmsg = strerror(errno);
3248 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003249
3250 if (error_pos == (size_t)-1)
3251 error_pos = wcstombs_errorpos(wstr);
3252
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003253 PyMem_Free(wstr);
3254 Py_XDECREF(bytes);
3255
Victor Stinner2f197072011-12-17 07:08:30 +01003256 if (errmsg != NULL) {
3257 size_t errlen;
3258 wstr = _Py_char2wchar(errmsg, &errlen);
3259 if (wstr != NULL) {
3260 reason = PyUnicode_FromWideChar(wstr, errlen);
3261 PyMem_Free(wstr);
3262 } else
3263 errmsg = NULL;
3264 }
3265 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003266 reason = PyUnicode_FromString(
3267 "wcstombs() encountered an unencodable "
3268 "wide character");
3269 if (reason == NULL)
3270 return NULL;
3271
3272 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3273 "locale", unicode,
3274 (Py_ssize_t)error_pos,
3275 (Py_ssize_t)(error_pos+1),
3276 reason);
3277 Py_DECREF(reason);
3278 if (exc != NULL) {
3279 PyCodec_StrictErrors(exc);
3280 Py_XDECREF(exc);
3281 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003282 return NULL;
3283}
3284
Victor Stinnerad158722010-10-27 00:25:46 +00003285PyObject *
3286PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003287{
Victor Stinner99b95382011-07-04 14:23:54 +02003288#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003289 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003290#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003291 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003292#else
Victor Stinner793b5312011-04-27 00:24:21 +02003293 PyInterpreterState *interp = PyThreadState_GET()->interp;
3294 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3295 cannot use it to encode and decode filenames before it is loaded. Load
3296 the Python codec requires to encode at least its own filename. Use the C
3297 version of the locale codec until the codec registry is initialized and
3298 the Python codec is loaded.
3299
3300 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3301 cannot only rely on it: check also interp->fscodec_initialized for
3302 subinterpreters. */
3303 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003304 return PyUnicode_AsEncodedString(unicode,
3305 Py_FileSystemDefaultEncoding,
3306 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003307 }
3308 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003309 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003310 }
Victor Stinnerad158722010-10-27 00:25:46 +00003311#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003312}
3313
Alexander Belopolsky40018472011-02-26 01:02:56 +00003314PyObject *
3315PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003316 const char *encoding,
3317 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318{
3319 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003320 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003321
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322 if (!PyUnicode_Check(unicode)) {
3323 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 }
Fred Drakee4315f52000-05-09 19:53:39 +00003326
Fred Drakee4315f52000-05-09 19:53:39 +00003327 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003328 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003329 if ((strcmp(lower, "utf-8") == 0) ||
3330 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003331 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003332 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003333 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003334 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003335 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003336 }
Victor Stinner37296e82010-06-10 13:36:23 +00003337 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003338 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003339 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003340 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003341#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003342 else if (strcmp(lower, "mbcs") == 0)
3343 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003344#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003345 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003346 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348
3349 /* Encode via the codec registry */
3350 v = PyCodec_Encode(unicode, encoding, errors);
3351 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003352 return NULL;
3353
3354 /* The normal path */
3355 if (PyBytes_Check(v))
3356 return v;
3357
3358 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003359 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003360 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003361 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003362
3363 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3364 "encoder %s returned bytearray instead of bytes",
3365 encoding);
3366 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003367 Py_DECREF(v);
3368 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003369 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003370
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003371 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3372 Py_DECREF(v);
3373 return b;
3374 }
3375
3376 PyErr_Format(PyExc_TypeError,
3377 "encoder did not return a bytes object (type=%.400s)",
3378 Py_TYPE(v)->tp_name);
3379 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003380 return NULL;
3381}
3382
Alexander Belopolsky40018472011-02-26 01:02:56 +00003383PyObject *
3384PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003385 const char *encoding,
3386 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003387{
3388 PyObject *v;
3389
3390 if (!PyUnicode_Check(unicode)) {
3391 PyErr_BadArgument();
3392 goto onError;
3393 }
3394
3395 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003397
3398 /* Encode via the codec registry */
3399 v = PyCodec_Encode(unicode, encoding, errors);
3400 if (v == NULL)
3401 goto onError;
3402 if (!PyUnicode_Check(v)) {
3403 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003404 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003405 Py_TYPE(v)->tp_name);
3406 Py_DECREF(v);
3407 goto onError;
3408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003410
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 return NULL;
3413}
3414
/* Locate the first byte sequence in str[0..len) that cannot be converted
   to a wide character.

   With HAVE_MBRTOWC the input is walked one character at a time using
   mbrtowc(); the byte offset of the first invalid or incomplete sequence
   is returned.  Returns 0 when no such sequence is found (or when
   mbrtowc() is unavailable), which is indistinguishable from a failure at
   offset 0. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No way to probe character by character on this platform. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3445
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003446PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003447PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003448 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003449{
3450 wchar_t smallbuf[256];
3451 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3452 wchar_t *wstr;
3453 size_t wlen, wlen2;
3454 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003455 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003456 size_t error_pos;
3457 char *errmsg;
3458 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003459
3460 if (locale_error_handler(errors, &surrogateescape) < 0)
3461 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003462
3463 if (str[len] != '\0' || len != strlen(str)) {
3464 PyErr_SetString(PyExc_TypeError, "embedded null character");
3465 return NULL;
3466 }
3467
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003468 if (surrogateescape) {
3469 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003470 wstr = _Py_char2wchar(str, &wlen);
3471 if (wstr == NULL) {
3472 if (wlen == (size_t)-1)
3473 PyErr_NoMemory();
3474 else
3475 PyErr_SetFromErrno(PyExc_OSError);
3476 return NULL;
3477 }
3478
3479 unicode = PyUnicode_FromWideChar(wstr, wlen);
3480 PyMem_Free(wstr);
3481 }
3482 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003483 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003484#ifndef HAVE_BROKEN_MBSTOWCS
3485 wlen = mbstowcs(NULL, str, 0);
3486#else
3487 wlen = len;
3488#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003489 if (wlen == (size_t)-1)
3490 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003491 if (wlen+1 <= smallbuf_len) {
3492 wstr = smallbuf;
3493 }
3494 else {
3495 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3496 return PyErr_NoMemory();
3497
3498 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3499 if (!wstr)
3500 return PyErr_NoMemory();
3501 }
3502
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003503 wlen2 = mbstowcs(wstr, str, wlen+1);
3504 if (wlen2 == (size_t)-1) {
3505 if (wstr != smallbuf)
3506 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003507 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003508 }
3509#ifdef HAVE_BROKEN_MBSTOWCS
3510 assert(wlen2 == wlen);
3511#endif
3512 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3513 if (wstr != smallbuf)
3514 PyMem_Free(wstr);
3515 }
3516 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003517
3518decode_error:
3519 errmsg = strerror(errno);
3520 assert(errmsg != NULL);
3521
3522 error_pos = mbstowcs_errorpos(str, len);
3523 if (errmsg != NULL) {
3524 size_t errlen;
3525 wstr = _Py_char2wchar(errmsg, &errlen);
3526 if (wstr != NULL) {
3527 reason = PyUnicode_FromWideChar(wstr, errlen);
3528 PyMem_Free(wstr);
3529 } else
3530 errmsg = NULL;
3531 }
3532 if (errmsg == NULL)
3533 reason = PyUnicode_FromString(
3534 "mbstowcs() encountered an invalid multibyte sequence");
3535 if (reason == NULL)
3536 return NULL;
3537
3538 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3539 "locale", str, len,
3540 (Py_ssize_t)error_pos,
3541 (Py_ssize_t)(error_pos+1),
3542 reason);
3543 Py_DECREF(reason);
3544 if (exc != NULL) {
3545 PyCodec_StrictErrors(exc);
3546 Py_XDECREF(exc);
3547 }
3548 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003549}
3550
3551PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003552PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003553{
3554 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003555 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003556}
3557
3558
3559PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003560PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003561 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003562 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3563}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003564
Christian Heimes5894ba72007-11-04 11:43:14 +00003565PyObject*
3566PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3567{
Victor Stinner99b95382011-07-04 14:23:54 +02003568#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003569 return PyUnicode_DecodeMBCS(s, size, NULL);
3570#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003571 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003572#else
Victor Stinner793b5312011-04-27 00:24:21 +02003573 PyInterpreterState *interp = PyThreadState_GET()->interp;
3574 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3575 cannot use it to encode and decode filenames before it is loaded. Load
3576 the Python codec requires to encode at least its own filename. Use the C
3577 version of the locale codec until the codec registry is initialized and
3578 the Python codec is loaded.
3579
3580 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3581 cannot only rely on it: check also interp->fscodec_initialized for
3582 subinterpreters. */
3583 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003584 return PyUnicode_Decode(s, size,
3585 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003586 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003587 }
3588 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003589 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003590 }
Victor Stinnerad158722010-10-27 00:25:46 +00003591#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003592}
3593
Martin v. Löwis011e8422009-05-05 04:43:17 +00003594
3595int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003596_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003597{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003598 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003599
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003600 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003601 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003602 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3603 PyUnicode_GET_LENGTH(str), '\0', 1);
3604 if (pos == -1)
3605 return 0;
3606 else
3607 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003608}
3609
Antoine Pitrou13348842012-01-29 18:36:34 +01003610int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003611PyUnicode_FSConverter(PyObject* arg, void* addr)
3612{
3613 PyObject *output = NULL;
3614 Py_ssize_t size;
3615 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003616 if (arg == NULL) {
3617 Py_DECREF(*(PyObject**)addr);
3618 return 1;
3619 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003620 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003621 output = arg;
3622 Py_INCREF(output);
3623 }
3624 else {
3625 arg = PyUnicode_FromObject(arg);
3626 if (!arg)
3627 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003628 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003629 Py_DECREF(arg);
3630 if (!output)
3631 return 0;
3632 if (!PyBytes_Check(output)) {
3633 Py_DECREF(output);
3634 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3635 return 0;
3636 }
3637 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003638 size = PyBytes_GET_SIZE(output);
3639 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003640 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003641 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003642 Py_DECREF(output);
3643 return 0;
3644 }
3645 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003646 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003647}
3648
3649
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003650int
3651PyUnicode_FSDecoder(PyObject* arg, void* addr)
3652{
3653 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003654 if (arg == NULL) {
3655 Py_DECREF(*(PyObject**)addr);
3656 return 1;
3657 }
3658 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003659 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003660 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003661 output = arg;
3662 Py_INCREF(output);
3663 }
3664 else {
3665 arg = PyBytes_FromObject(arg);
3666 if (!arg)
3667 return 0;
3668 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3669 PyBytes_GET_SIZE(arg));
3670 Py_DECREF(arg);
3671 if (!output)
3672 return 0;
3673 if (!PyUnicode_Check(output)) {
3674 Py_DECREF(output);
3675 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3676 return 0;
3677 }
3678 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003679 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003680 Py_DECREF(output);
3681 return 0;
3682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003684 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003685 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3686 Py_DECREF(output);
3687 return 0;
3688 }
3689 *(PyObject**)addr = output;
3690 return Py_CLEANUP_SUPPORTED;
3691}
3692
3693
Martin v. Löwis5b222132007-06-10 09:51:05 +00003694char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003695PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003696{
Christian Heimesf3863112007-11-22 07:46:41 +00003697 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003698
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003699 if (!PyUnicode_Check(unicode)) {
3700 PyErr_BadArgument();
3701 return NULL;
3702 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003703 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003704 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003705
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003706 if (PyUnicode_UTF8(unicode) == NULL) {
3707 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003708 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3709 if (bytes == NULL)
3710 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003711 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3712 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 Py_DECREF(bytes);
3714 return NULL;
3715 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003716 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3717 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3718 PyBytes_AS_STRING(bytes),
3719 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720 Py_DECREF(bytes);
3721 }
3722
3723 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003724 *psize = PyUnicode_UTF8_LENGTH(unicode);
3725 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003726}
3727
3728char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003730{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3732}
3733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734Py_UNICODE *
3735PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 const unsigned char *one_byte;
3738#if SIZEOF_WCHAR_T == 4
3739 const Py_UCS2 *two_bytes;
3740#else
3741 const Py_UCS4 *four_bytes;
3742 const Py_UCS4 *ucs4_end;
3743 Py_ssize_t num_surrogates;
3744#endif
3745 wchar_t *w;
3746 wchar_t *wchar_end;
3747
3748 if (!PyUnicode_Check(unicode)) {
3749 PyErr_BadArgument();
3750 return NULL;
3751 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003754 assert(_PyUnicode_KIND(unicode) != 0);
3755 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3760 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 num_surrogates = 0;
3762
3763 for (; four_bytes < ucs4_end; ++four_bytes) {
3764 if (*four_bytes > 0xFFFF)
3765 ++num_surrogates;
3766 }
3767
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3769 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3770 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 PyErr_NoMemory();
3772 return NULL;
3773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 w = _PyUnicode_WSTR(unicode);
3777 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3778 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3780 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003781 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003783 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3784 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 }
3786 else
3787 *w = *four_bytes;
3788
3789 if (w > wchar_end) {
3790 assert(0 && "Miscalculated string end");
3791 }
3792 }
3793 *w = 0;
3794#else
3795 /* sizeof(wchar_t) == 4 */
3796 Py_FatalError("Impossible unicode object state, wstr and str "
3797 "should share memory already.");
3798 return NULL;
3799#endif
3800 }
3801 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003802 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3803 (_PyUnicode_LENGTH(unicode) + 1));
3804 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805 PyErr_NoMemory();
3806 return NULL;
3807 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003808 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3809 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3810 w = _PyUnicode_WSTR(unicode);
3811 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3814 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 for (; w < wchar_end; ++one_byte, ++w)
3816 *w = *one_byte;
3817 /* null-terminate the wstr */
3818 *w = 0;
3819 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003820 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003822 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823 for (; w < wchar_end; ++two_bytes, ++w)
3824 *w = *two_bytes;
3825 /* null-terminate the wstr */
3826 *w = 0;
3827#else
3828 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003829 PyObject_FREE(_PyUnicode_WSTR(unicode));
3830 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831 Py_FatalError("Impossible unicode object state, wstr "
3832 "and str should share memory already.");
3833 return NULL;
3834#endif
3835 }
3836 else {
3837 assert(0 && "This should never happen.");
3838 }
3839 }
3840 }
3841 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003842 *size = PyUnicode_WSTR_LENGTH(unicode);
3843 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003844}
3845
Alexander Belopolsky40018472011-02-26 01:02:56 +00003846Py_UNICODE *
3847PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850}
3851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852
Alexander Belopolsky40018472011-02-26 01:02:56 +00003853Py_ssize_t
3854PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855{
3856 if (!PyUnicode_Check(unicode)) {
3857 PyErr_BadArgument();
3858 goto onError;
3859 }
3860 return PyUnicode_GET_SIZE(unicode);
3861
Benjamin Peterson29060642009-01-31 22:14:21 +00003862 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 return -1;
3864}
3865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866Py_ssize_t
3867PyUnicode_GetLength(PyObject *unicode)
3868{
Victor Stinner07621332012-06-16 04:53:46 +02003869 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870 PyErr_BadArgument();
3871 return -1;
3872 }
Victor Stinner07621332012-06-16 04:53:46 +02003873 if (PyUnicode_READY(unicode) == -1)
3874 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 return PyUnicode_GET_LENGTH(unicode);
3876}
3877
3878Py_UCS4
3879PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3880{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003881 void *data;
3882 int kind;
3883
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003884 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3885 PyErr_BadArgument();
3886 return (Py_UCS4)-1;
3887 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003888 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003889 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 return (Py_UCS4)-1;
3891 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003892 data = PyUnicode_DATA(unicode);
3893 kind = PyUnicode_KIND(unicode);
3894 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895}
3896
3897int
3898PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3899{
3900 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003901 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003902 return -1;
3903 }
Victor Stinner488fa492011-12-12 00:01:39 +01003904 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003905 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003906 PyErr_SetString(PyExc_IndexError, "string index out of range");
3907 return -1;
3908 }
Victor Stinner488fa492011-12-12 00:01:39 +01003909 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003910 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003911 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3912 PyErr_SetString(PyExc_ValueError, "character out of range");
3913 return -1;
3914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3916 index, ch);
3917 return 0;
3918}
3919
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3925
Victor Stinner554f3f02010-06-16 23:33:54 +00003926/* create or adjust a UnicodeDecodeError */
3927static void
3928make_decode_exception(PyObject **exceptionObject,
3929 const char *encoding,
3930 const char *input, Py_ssize_t length,
3931 Py_ssize_t startpos, Py_ssize_t endpos,
3932 const char *reason)
3933{
3934 if (*exceptionObject == NULL) {
3935 *exceptionObject = PyUnicodeDecodeError_Create(
3936 encoding, input, length, startpos, endpos, reason);
3937 }
3938 else {
3939 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3940 goto onError;
3941 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3942 goto onError;
3943 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3944 goto onError;
3945 }
3946 return;
3947
3948onError:
3949 Py_DECREF(*exceptionObject);
3950 *exceptionObject = NULL;
3951}
3952
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors / errorHandler:   name of the error handler and a cache for
                            the looked-up callable (filled on first use).
   encoding / reason:       passed on to the UnicodeDecodeError.
   input / inend:           start and end of the input bytes; rewritten
                            from the exception's object attribute after
                            the callback ran.
   startinpos / endinpos:   offending input range; *endinpos is set to
                            the position returned by the callback.
   exceptionObject:         cache for the UnicodeDecodeError instance.
   inptr:                   set to *input + new position.
   output / outpos:         wchar_t-kind result object and the write
                            position in it; the object is resized when
                            the replacement does not fit.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving only
           the message text. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the bytes accessors below must not be applied to a
           non-bytes object. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy by length rather than with wcsncpy(): wcsncpy() stops at the
       first NUL in the replacement and zero-fills the remainder, which
       would drop any characters following an embedded NUL. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4059
4060static int
4061unicode_decode_call_errorhandler_writer(
4062 const char *errors, PyObject **errorHandler,
4063 const char *encoding, const char *reason,
4064 const char **input, const char **inend, Py_ssize_t *startinpos,
4065 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4066 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4067{
4068 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4069
4070 PyObject *restuple = NULL;
4071 PyObject *repunicode = NULL;
4072 Py_ssize_t insize;
4073 Py_ssize_t newpos;
4074 PyObject *inputobj = NULL;
4075
4076 if (*errorHandler == NULL) {
4077 *errorHandler = PyCodec_LookupError(errors);
4078 if (*errorHandler == NULL)
4079 goto onError;
4080 }
4081
4082 make_decode_exception(exceptionObject,
4083 encoding,
4084 *input, *inend - *input,
4085 *startinpos, *endinpos,
4086 reason);
4087 if (*exceptionObject == NULL)
4088 goto onError;
4089
4090 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4091 if (restuple == NULL)
4092 goto onError;
4093 if (!PyTuple_Check(restuple)) {
4094 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4095 goto onError;
4096 }
4097 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004098 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004099
4100 /* Copy back the bytes variables, which might have been modified by the
4101 callback */
4102 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4103 if (!inputobj)
4104 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004105 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004107 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004108 *input = PyBytes_AS_STRING(inputobj);
4109 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004110 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004111 /* we can DECREF safely, as the exception has another reference,
4112 so the object won't go away. */
4113 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004117 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4119 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004120 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121
Victor Stinner8f674cc2013-04-17 23:02:17 +02004122 if (PyUnicode_READY(repunicode) < 0)
4123 goto onError;
4124 if (PyUnicode_GET_LENGTH(repunicode) > 1)
4125 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004126 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004127 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004130 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004133 Py_XDECREF(restuple);
4134 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004138 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139}
4140
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that a
 * compound flag expression (e.g. a || b) keeps its meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004221
Alexander Belopolsky40018472011-02-26 01:02:56 +00004222PyObject *
4223PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004224 Py_ssize_t size,
4225 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004226{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004227 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4228}
4229
Antoine Pitrou244651a2009-05-04 18:56:13 +00004230/* The decoder. The only state we preserve is our read position,
4231 * i.e. how many characters we have consumed. So if we end in the
4232 * middle of a shift sequence we have to back off the read position
4233 * and the output to the beginning of the sequence, otherwise we lose
4234 * all the shift state (seen bits, number of bits seen, high
4235 * surrogate). */
4236
Alexander Belopolsky40018472011-02-26 01:02:56 +00004237PyObject *
4238PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004239 Py_ssize_t size,
4240 const char *errors,
4241 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004242{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004244 Py_ssize_t startinpos;
4245 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004246 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004247 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004248 const char *errmsg = "";
4249 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004250 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004251 unsigned int base64bits = 0;
4252 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004253 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 PyObject *errorHandler = NULL;
4255 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004256
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004257 if (size == 0) {
4258 if (consumed)
4259 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004260 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004261 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004262
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004263 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004264 _PyUnicodeWriter_Init(&writer);
4265 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266
4267 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004268 e = s + size;
4269
4270 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004271 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004272 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004273 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004274
Antoine Pitrou244651a2009-05-04 18:56:13 +00004275 if (inShift) { /* in a base-64 section */
4276 if (IS_BASE64(ch)) { /* consume a base-64 character */
4277 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4278 base64bits += 6;
4279 s++;
4280 if (base64bits >= 16) {
4281 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004282 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004283 base64bits -= 16;
4284 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4285 if (surrogate) {
4286 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004287 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4288 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004289 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004290 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004291 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004292 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004293 }
4294 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004295 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004296 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004298 }
4299 }
Victor Stinner551ac952011-11-29 22:58:13 +01004300 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004301 /* first surrogate */
4302 surrogate = outCh;
4303 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004304 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004305 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004306 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307 }
4308 }
4309 }
4310 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004311 inShift = 0;
4312 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004313 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004314 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004315 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004316 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004317 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 if (base64bits > 0) { /* left-over bits */
4319 if (base64bits >= 6) {
4320 /* We've seen at least one base-64 character */
4321 errmsg = "partial character in shift sequence";
4322 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 else {
4325 /* Some bits remain; they should be zero */
4326 if (base64buffer != 0) {
4327 errmsg = "non-zero padding bits in shift sequence";
4328 goto utf7Error;
4329 }
4330 }
4331 }
4332 if (ch != '-') {
4333 /* '-' is absorbed; other terminating
4334 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004335 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004336 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004337 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338 }
4339 }
4340 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004342 s++; /* consume '+' */
4343 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004344 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004345 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004346 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 }
4348 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004352 }
4353 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004355 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004356 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004357 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004358 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 else {
4360 startinpos = s-starts;
4361 s++;
4362 errmsg = "unexpected special character";
4363 goto utf7Error;
4364 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004368 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 errors, &errorHandler,
4370 "utf7", errmsg,
4371 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004372 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 }
4375
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 /* end of string */
4377
4378 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4379 /* if we're in an inconsistent state, that's an error */
4380 if (surrogate ||
4381 (base64bits >= 6) ||
4382 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004384 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 errors, &errorHandler,
4386 "utf7", "unterminated shift sequence",
4387 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004388 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 goto onError;
4390 if (s < e)
4391 goto restart;
4392 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394
4395 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004396 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004398 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004399 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 }
4401 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004402 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004404 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 Py_XDECREF(errorHandler);
4407 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 Py_XDECREF(errorHandler);
4412 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 return NULL;
4415}
4416
4417
Alexander Belopolsky40018472011-02-26 01:02:56 +00004418PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004419_PyUnicode_EncodeUTF7(PyObject *str,
4420 int base64SetO,
4421 int base64WhiteSpace,
4422 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004424 int kind;
4425 void *data;
4426 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004427 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004429 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 unsigned int base64bits = 0;
4431 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 char * out;
4433 char * start;
4434
Benjamin Petersonbac79492012-01-14 13:34:47 -05004435 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004436 return NULL;
4437 kind = PyUnicode_KIND(str);
4438 data = PyUnicode_DATA(str);
4439 len = PyUnicode_GET_LENGTH(str);
4440
4441 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004444 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004445 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004446 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004447 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004448 if (v == NULL)
4449 return NULL;
4450
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004451 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004452 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004453 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004454
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 if (inShift) {
4456 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4457 /* shifting out */
4458 if (base64bits) { /* output remaining bits */
4459 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4460 base64buffer = 0;
4461 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 }
4463 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 /* Characters not in the BASE64 set implicitly unshift the sequence
4465 so no '-' is required, except if the character is itself a '-' */
4466 if (IS_BASE64(ch) || ch == '-') {
4467 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004468 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 *out++ = (char) ch;
4470 }
4471 else {
4472 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004473 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004475 else { /* not in a shift sequence */
4476 if (ch == '+') {
4477 *out++ = '+';
4478 *out++ = '-';
4479 }
4480 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4481 *out++ = (char) ch;
4482 }
4483 else {
4484 *out++ = '+';
4485 inShift = 1;
4486 goto encode_char;
4487 }
4488 }
4489 continue;
4490encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004492 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004493
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494 /* code first surrogate */
4495 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004496 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 while (base64bits >= 6) {
4498 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4499 base64bits -= 6;
4500 }
4501 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004502 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004503 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504 base64bits += 16;
4505 base64buffer = (base64buffer << 16) | ch;
4506 while (base64bits >= 6) {
4507 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4508 base64bits -= 6;
4509 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004510 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511 if (base64bits)
4512 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4513 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004515 if (_PyBytes_Resize(&v, out - start) < 0)
4516 return NULL;
4517 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004519PyObject *
4520PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4521 Py_ssize_t size,
4522 int base64SetO,
4523 int base64WhiteSpace,
4524 const char *errors)
4525{
4526 PyObject *result;
4527 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4528 if (tmp == NULL)
4529 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004530 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004531 base64WhiteSpace, errors);
4532 Py_DECREF(tmp);
4533 return result;
4534}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536#undef IS_BASE64
4537#undef FROM_BASE64
4538#undef TO_BASE64
4539#undef DECODE_DIRECT
4540#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542/* --- UTF-8 Codec -------------------------------------------------------- */
4543
Alexander Belopolsky40018472011-02-26 01:02:56 +00004544PyObject *
4545PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004546 Py_ssize_t size,
4547 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548{
Walter Dörwald69652032004-09-07 20:24:22 +00004549 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4550}
4551
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004552#include "stringlib/asciilib.h"
4553#include "stringlib/codecs.h"
4554#include "stringlib/undef.h"
4555
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004556#include "stringlib/ucs1lib.h"
4557#include "stringlib/codecs.h"
4558#include "stringlib/undef.h"
4559
4560#include "stringlib/ucs2lib.h"
4561#include "stringlib/codecs.h"
4562#include "stringlib/undef.h"
4563
4564#include "stringlib/ucs4lib.h"
4565#include "stringlib/codecs.h"
4566#include "stringlib/undef.h"
4567
Antoine Pitrouab868312009-01-10 15:40:25 +00004568/* Mask to quickly check whether a C 'long' contains a
4569 non-ASCII, UTF8-encoded char. */
4570#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004571# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004572#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004573# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004574#else
4575# error C 'long' size should be either 4 or 8!
4576#endif
4577
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004578static Py_ssize_t
4579ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004580{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004581 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004582 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004583
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004584#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004585 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4586 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004587 /* Fast path, see in STRINGLIB(utf8_decode) for
4588 an explanation. */
4589 /* Help register allocation */
4590 register const char *_p = p;
4591 register Py_UCS1 * q = dest;
4592 while (_p < aligned_end) {
4593 unsigned long value = *(const unsigned long *) _p;
4594 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004596 *((unsigned long *)q) = value;
4597 _p += SIZEOF_LONG;
4598 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004599 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004600 p = _p;
4601 while (p < end) {
4602 if ((unsigned char)*p & 0x80)
4603 break;
4604 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004606 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004608#endif
4609 while (p < end) {
4610 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4611 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004612 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004613 /* Help register allocation */
4614 register const char *_p = p;
4615 while (_p < aligned_end) {
4616 unsigned long value = *(unsigned long *) _p;
4617 if (value & ASCII_CHAR_MASK)
4618 break;
4619 _p += SIZEOF_LONG;
4620 }
4621 p = _p;
4622 if (_p == end)
4623 break;
4624 }
4625 if ((unsigned char)*p & 0x80)
4626 break;
4627 ++p;
4628 }
4629 memcpy(dest, start, p - start);
4630 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631}
Antoine Pitrouab868312009-01-10 15:40:25 +00004632
Victor Stinner785938e2011-12-11 20:09:03 +01004633PyObject *
4634PyUnicode_DecodeUTF8Stateful(const char *s,
4635 Py_ssize_t size,
4636 const char *errors,
4637 Py_ssize_t *consumed)
4638{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004639 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004640 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004641 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004642
4643 Py_ssize_t startinpos;
4644 Py_ssize_t endinpos;
4645 const char *errmsg = "";
4646 PyObject *errorHandler = NULL;
4647 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004648
4649 if (size == 0) {
4650 if (consumed)
4651 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004652 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004653 }
4654
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004655 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4656 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004657 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658 *consumed = 1;
4659 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004660 }
4661
Victor Stinner8f674cc2013-04-17 23:02:17 +02004662 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004663 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4664 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004665
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004666 writer.pos = ascii_decode(s, end, writer.data);
4667 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004668 while (s < end) {
4669 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004672 if (PyUnicode_IS_ASCII(writer.buffer))
4673 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004675 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004676 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004677 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678 } else {
4679 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681 }
4682
4683 switch (ch) {
4684 case 0:
4685 if (s == end || consumed)
4686 goto End;
4687 errmsg = "unexpected end of data";
4688 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004689 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004690 break;
4691 case 1:
4692 errmsg = "invalid start byte";
4693 startinpos = s - starts;
4694 endinpos = startinpos + 1;
4695 break;
4696 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004697 case 3:
4698 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004699 errmsg = "invalid continuation byte";
4700 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004701 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 break;
4703 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004704 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004705 goto onError;
4706 continue;
4707 }
4708
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004709 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004710 errors, &errorHandler,
4711 "utf-8", errmsg,
4712 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004713 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004714 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004715 }
4716
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004717End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 if (consumed)
4719 *consumed = s - starts;
4720
4721 Py_XDECREF(errorHandler);
4722 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004723 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004724
4725onError:
4726 Py_XDECREF(errorHandler);
4727 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004728 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004730}
4731
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004732#ifdef __APPLE__
4733
4734/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004735 used to decode the command line arguments on Mac OS X.
4736
4737 Return a pointer to a newly allocated wide character string (use
4738 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004739
4740wchar_t*
4741_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4742{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004743 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004744 wchar_t *unicode;
4745 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004746
4747 /* Note: size will always be longer than the resulting Unicode
4748 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004749 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004750 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004751 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4752 if (!unicode)
4753 return NULL;
4754
4755 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004756 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004758 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004759 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004760#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004762#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004763 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004764#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 if (ch > 0xFF) {
4766#if SIZEOF_WCHAR_T == 4
4767 assert(0);
4768#else
4769 assert(Py_UNICODE_IS_SURROGATE(ch));
4770 /* compute and append the two surrogates: */
4771 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4772 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4773#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004774 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 else {
4776 if (!ch && s == e)
4777 break;
4778 /* surrogateescape */
4779 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4780 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004781 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004783 return unicode;
4784}
4785
4786#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788/* Primary internal function which creates utf8 encoded bytes objects.
4789
4790 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004791 and allocate exactly as much space needed at the end. Else allocate the
4792 maximum possible needed (4 result bytes per Unicode character), and return
4793 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004794*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004795PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004796_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797{
Victor Stinner6099a032011-12-18 14:22:26 +01004798 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004799 void *data;
4800 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004802 if (!PyUnicode_Check(unicode)) {
4803 PyErr_BadArgument();
4804 return NULL;
4805 }
4806
4807 if (PyUnicode_READY(unicode) == -1)
4808 return NULL;
4809
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004810 if (PyUnicode_UTF8(unicode))
4811 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4812 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813
4814 kind = PyUnicode_KIND(unicode);
4815 data = PyUnicode_DATA(unicode);
4816 size = PyUnicode_GET_LENGTH(unicode);
4817
Benjamin Petersonead6b532011-12-20 17:23:42 -06004818 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004819 default:
4820 assert(0);
4821 case PyUnicode_1BYTE_KIND:
4822 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4823 assert(!PyUnicode_IS_ASCII(unicode));
4824 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4825 case PyUnicode_2BYTE_KIND:
4826 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4827 case PyUnicode_4BYTE_KIND:
4828 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830}
4831
Alexander Belopolsky40018472011-02-26 01:02:56 +00004832PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004833PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4834 Py_ssize_t size,
4835 const char *errors)
4836{
4837 PyObject *v, *unicode;
4838
4839 unicode = PyUnicode_FromUnicode(s, size);
4840 if (unicode == NULL)
4841 return NULL;
4842 v = _PyUnicode_AsUTF8String(unicode, errors);
4843 Py_DECREF(unicode);
4844 return v;
4845}
4846
4847PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004848PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004850 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851}
4852
Walter Dörwald41980ca2007-08-16 21:55:45 +00004853/* --- UTF-32 Codec ------------------------------------------------------- */
4854
4855PyObject *
4856PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 Py_ssize_t size,
4858 const char *errors,
4859 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004860{
4861 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4862}
4863
4864PyObject *
4865PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 Py_ssize_t size,
4867 const char *errors,
4868 int *byteorder,
4869 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004870{
4871 const char *starts = s;
4872 Py_ssize_t startinpos;
4873 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004874 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004875 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004876 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004877 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004878 PyObject *errorHandler = NULL;
4879 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004880
Walter Dörwald41980ca2007-08-16 21:55:45 +00004881 q = (unsigned char *)s;
4882 e = q + size;
4883
4884 if (byteorder)
4885 bo = *byteorder;
4886
4887 /* Check for BOM marks (U+FEFF) in the input and adjust current
4888 byte order setting accordingly. In native mode, the leading BOM
4889 mark is skipped, in all other modes, it is copied to the output
4890 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004891 if (bo == 0 && size >= 4) {
4892 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4893 if (bom == 0x0000FEFF) {
4894 bo = -1;
4895 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004897 else if (bom == 0xFFFE0000) {
4898 bo = 1;
4899 q += 4;
4900 }
4901 if (byteorder)
4902 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004903 }
4904
Victor Stinnere64322e2012-10-30 23:12:47 +01004905 if (q == e) {
4906 if (consumed)
4907 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004908 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004909 }
4910
Victor Stinnere64322e2012-10-30 23:12:47 +01004911#ifdef WORDS_BIGENDIAN
4912 le = bo < 0;
4913#else
4914 le = bo <= 0;
4915#endif
4916
Victor Stinner8f674cc2013-04-17 23:02:17 +02004917 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004918 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4919 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004920
Victor Stinnere64322e2012-10-30 23:12:47 +01004921 while (1) {
4922 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004923 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004924
Victor Stinnere64322e2012-10-30 23:12:47 +01004925 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004926 enum PyUnicode_Kind kind = writer.kind;
4927 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004928 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004929 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004930 if (le) {
4931 do {
4932 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4933 if (ch > maxch)
4934 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004935 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004936 q += 4;
4937 } while (q <= last);
4938 }
4939 else {
4940 do {
4941 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4942 if (ch > maxch)
4943 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004944 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004945 q += 4;
4946 } while (q <= last);
4947 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004948 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004949 }
4950
4951 if (ch <= maxch) {
4952 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004954 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004956 startinpos = ((const char *)q) - starts;
4957 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004959 else {
4960 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004961 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01004962 goto onError;
4963 q += 4;
4964 continue;
4965 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004967 startinpos = ((const char *)q) - starts;
4968 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004970
4971 /* The remaining input chars are ignored if the callback
4972 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004973 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 errors, &errorHandler,
4975 "utf32", errmsg,
4976 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004977 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979 }
4980
Walter Dörwald41980ca2007-08-16 21:55:45 +00004981 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 Py_XDECREF(errorHandler);
4985 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004986 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004987
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004989 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990 Py_XDECREF(errorHandler);
4991 Py_XDECREF(exc);
4992 return NULL;
4993}
4994
4995PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004996_PyUnicode_EncodeUTF32(PyObject *str,
4997 const char *errors,
4998 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004999{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005000 int kind;
5001 void *data;
5002 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005003 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005005 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005007#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008 int iorder[] = {0, 1, 2, 3};
5009#else
5010 int iorder[] = {3, 2, 1, 0};
5011#endif
5012
Benjamin Peterson29060642009-01-31 22:14:21 +00005013#define STORECHAR(CH) \
5014 do { \
5015 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5016 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5017 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5018 p[iorder[0]] = (CH) & 0xff; \
5019 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005020 } while(0)
5021
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005022 if (!PyUnicode_Check(str)) {
5023 PyErr_BadArgument();
5024 return NULL;
5025 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005026 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005027 return NULL;
5028 kind = PyUnicode_KIND(str);
5029 data = PyUnicode_DATA(str);
5030 len = PyUnicode_GET_LENGTH(str);
5031
5032 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005033 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005035 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005036 if (v == NULL)
5037 return NULL;
5038
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005039 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005040 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005042 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005043 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044
5045 if (byteorder == -1) {
5046 /* force LE */
5047 iorder[0] = 0;
5048 iorder[1] = 1;
5049 iorder[2] = 2;
5050 iorder[3] = 3;
5051 }
5052 else if (byteorder == 1) {
5053 /* force BE */
5054 iorder[0] = 3;
5055 iorder[1] = 2;
5056 iorder[2] = 1;
5057 iorder[3] = 0;
5058 }
5059
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005060 for (i = 0; i < len; i++)
5061 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005062
5063 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005064 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065#undef STORECHAR
5066}
5067
Alexander Belopolsky40018472011-02-26 01:02:56 +00005068PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005069PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5070 Py_ssize_t size,
5071 const char *errors,
5072 int byteorder)
5073{
5074 PyObject *result;
5075 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5076 if (tmp == NULL)
5077 return NULL;
5078 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5079 Py_DECREF(tmp);
5080 return result;
5081}
5082
5083PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005084PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085{
Victor Stinnerb960b342011-11-20 19:12:52 +01005086 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087}
5088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089/* --- UTF-16 Codec ------------------------------------------------------- */
5090
Tim Peters772747b2001-08-09 22:21:55 +00005091PyObject *
5092PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 Py_ssize_t size,
5094 const char *errors,
5095 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096{
Walter Dörwald69652032004-09-07 20:24:22 +00005097 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5098}
5099
5100PyObject *
5101PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 Py_ssize_t size,
5103 const char *errors,
5104 int *byteorder,
5105 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005108 Py_ssize_t startinpos;
5109 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005110 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005111 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005112 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005113 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005114 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 PyObject *errorHandler = NULL;
5116 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117
Tim Peters772747b2001-08-09 22:21:55 +00005118 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005119 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120
5121 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005122 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005124 /* Check for BOM marks (U+FEFF) in the input and adjust current
5125 byte order setting accordingly. In native mode, the leading BOM
5126 mark is skipped, in all other modes, it is copied to the output
5127 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005128 if (bo == 0 && size >= 2) {
5129 const Py_UCS4 bom = (q[1] << 8) | q[0];
5130 if (bom == 0xFEFF) {
5131 q += 2;
5132 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005134 else if (bom == 0xFFFE) {
5135 q += 2;
5136 bo = 1;
5137 }
5138 if (byteorder)
5139 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141
Antoine Pitrou63065d72012-05-15 23:48:04 +02005142 if (q == e) {
5143 if (consumed)
5144 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005145 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005146 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005147
Christian Heimes743e0cd2012-10-17 23:52:17 +02005148#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005149 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005150#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005151 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005152#endif
Tim Peters772747b2001-08-09 22:21:55 +00005153
Antoine Pitrou63065d72012-05-15 23:48:04 +02005154 /* Note: size will always be longer than the resulting Unicode
5155 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005156 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005157 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5158 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005159
Antoine Pitrou63065d72012-05-15 23:48:04 +02005160 while (1) {
5161 Py_UCS4 ch = 0;
5162 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005163 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005164 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005165 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005166 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005167 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005168 native_ordering);
5169 else
5170 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005171 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005172 native_ordering);
5173 } else if (kind == PyUnicode_2BYTE_KIND) {
5174 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005175 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005176 native_ordering);
5177 } else {
5178 assert(kind == PyUnicode_4BYTE_KIND);
5179 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005180 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005181 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005182 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005183 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005184
Antoine Pitrou63065d72012-05-15 23:48:04 +02005185 switch (ch)
5186 {
5187 case 0:
5188 /* remaining byte at the end? (size should be even) */
5189 if (q == e || consumed)
5190 goto End;
5191 errmsg = "truncated data";
5192 startinpos = ((const char *)q) - starts;
5193 endinpos = ((const char *)e) - starts;
5194 break;
5195 /* The remaining input chars are ignored if the callback
5196 chooses to skip the input */
5197 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005198 q -= 2;
5199 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005200 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005201 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005202 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005203 endinpos = ((const char *)e) - starts;
5204 break;
5205 case 2:
5206 errmsg = "illegal encoding";
5207 startinpos = ((const char *)q) - 2 - starts;
5208 endinpos = startinpos + 2;
5209 break;
5210 case 3:
5211 errmsg = "illegal UTF-16 surrogate";
5212 startinpos = ((const char *)q) - 4 - starts;
5213 endinpos = startinpos + 2;
5214 break;
5215 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005216 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005217 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 continue;
5219 }
5220
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005221 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005222 errors,
5223 &errorHandler,
5224 "utf16", errmsg,
5225 &starts,
5226 (const char **)&e,
5227 &startinpos,
5228 &endinpos,
5229 &exc,
5230 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005231 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 }
5234
Antoine Pitrou63065d72012-05-15 23:48:04 +02005235End:
Walter Dörwald69652032004-09-07 20:24:22 +00005236 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005238
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 Py_XDECREF(errorHandler);
5240 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005241 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005245 Py_XDECREF(errorHandler);
5246 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 return NULL;
5248}
5249
Tim Peters772747b2001-08-09 22:21:55 +00005250PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005251_PyUnicode_EncodeUTF16(PyObject *str,
5252 const char *errors,
5253 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005255 enum PyUnicode_Kind kind;
5256 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005257 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005258 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005259 unsigned short *out;
5260 Py_ssize_t bytesize;
5261 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005262#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005263 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005264#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005265 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005266#endif
5267
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005268 if (!PyUnicode_Check(str)) {
5269 PyErr_BadArgument();
5270 return NULL;
5271 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005272 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005273 return NULL;
5274 kind = PyUnicode_KIND(str);
5275 data = PyUnicode_DATA(str);
5276 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005277
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005278 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005279 if (kind == PyUnicode_4BYTE_KIND) {
5280 const Py_UCS4 *in = (const Py_UCS4 *)data;
5281 const Py_UCS4 *end = in + len;
5282 while (in < end)
5283 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005284 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005285 }
5286 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005288 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005289 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 if (v == NULL)
5291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005293 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005294 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005295 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005297 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005298 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005299 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005300
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005301 switch (kind) {
5302 case PyUnicode_1BYTE_KIND: {
5303 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5304 break;
Tim Peters772747b2001-08-09 22:21:55 +00005305 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005306 case PyUnicode_2BYTE_KIND: {
5307 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5308 break;
Tim Peters772747b2001-08-09 22:21:55 +00005309 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005310 case PyUnicode_4BYTE_KIND: {
5311 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5312 break;
5313 }
5314 default:
5315 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005316 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005317
5318 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005319 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320}
5321
Alexander Belopolsky40018472011-02-26 01:02:56 +00005322PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005323PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5324 Py_ssize_t size,
5325 const char *errors,
5326 int byteorder)
5327{
5328 PyObject *result;
5329 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5330 if (tmp == NULL)
5331 return NULL;
5332 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5333 Py_DECREF(tmp);
5334 return result;
5335}
5336
5337PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005338PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005340 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341}
5342
5343/* --- Unicode Escape Codec ----------------------------------------------- */
5344
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string decodes to pure ASCII and, if so, how many
   characters the decoded string has.

   Returns the decoded length when every byte is ASCII and every escape is
   one whose result is known to be ASCII:
     - backslash + newline contributes nothing,
     - \\ \' \" \b \f \t \n \r \v \a contribute one character,
     - a backslash followed by any other ASCII character contributes two
       (the backslash and that character are both kept).

   Returns -1 when the length cannot be determined this way: `size` is
   negative, a byte above 127 occurs, a backslash is the last byte, or an
   octal, \x, \u, \U or \N escape is present (these may produce non-ASCII
   characters).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before using it in pointer arithmetic: forming
       s + size with a negative size is undefined behavior. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5399
Fredrik Lundh06d12682001-01-24 07:59:11 +00005400static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005401
Alexander Belopolsky40018472011-02-26 01:02:56 +00005402PyObject *
5403PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005404 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005405 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005407 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005408 Py_ssize_t startinpos;
5409 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005410 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005412 char* message;
5413 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005414 PyObject *errorHandler = NULL;
5415 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005416 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005417
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005418 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005419 if (len == 0)
5420 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005421
5422 /* After length_of_escaped_ascii_string() there are two alternatives,
5423 either the string is pure ASCII with named escapes like \n, etc.
5424 and we determined it's exact size (common case)
5425 or it contains \x, \u, ... escape sequences. then we create a
5426 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005427 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005428 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005429 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005430 }
5431 else {
5432 /* Escaped strings will always be longer than the resulting
5433 Unicode string, so we start with size here and then reduce the
5434 length after conversion to the true value.
5435 (but if the error callback returns a long replacement string
5436 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005437 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 }
5439
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005441 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005443
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 while (s < end) {
5445 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005446 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448
5449 /* Non-escape characters are interpreted as Unicode ordinals */
5450 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005451 x = (unsigned char)*s;
5452 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005453 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005454 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 continue;
5456 }
5457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 /* \ - Escapes */
5460 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005461 c = *s++;
5462 if (s > end)
5463 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005464
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005465 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005468#define WRITECHAR(ch) \
5469 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005470 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005471 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005472 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005473
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005475 case '\\': WRITECHAR('\\'); break;
5476 case '\'': WRITECHAR('\''); break;
5477 case '\"': WRITECHAR('\"'); break;
5478 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005479 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005480 case 'f': WRITECHAR('\014'); break;
5481 case 't': WRITECHAR('\t'); break;
5482 case 'n': WRITECHAR('\n'); break;
5483 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005484 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005485 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005486 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005487 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 case '0': case '1': case '2': case '3':
5491 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005492 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005493 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005494 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005495 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005496 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005498 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 break;
5500
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 /* hex escapes */
5502 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005504 digits = 2;
5505 message = "truncated \\xXX escape";
5506 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005510 digits = 4;
5511 message = "truncated \\uXXXX escape";
5512 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005515 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005516 digits = 8;
5517 message = "truncated \\UXXXXXXXX escape";
5518 hexescape:
5519 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005520 if (end - s < digits) {
5521 /* count only hex digits */
5522 for (; s < end; ++s) {
5523 c = (unsigned char)*s;
5524 if (!Py_ISXDIGIT(c))
5525 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005526 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005527 goto error;
5528 }
5529 for (; digits--; ++s) {
5530 c = (unsigned char)*s;
5531 if (!Py_ISXDIGIT(c))
5532 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005533 chr = (chr<<4) & ~0xF;
5534 if (c >= '0' && c <= '9')
5535 chr += c - '0';
5536 else if (c >= 'a' && c <= 'f')
5537 chr += 10 + c - 'a';
5538 else
5539 chr += 10 + c - 'A';
5540 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005541 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005542 /* _decoding_error will have already written into the
5543 target buffer. */
5544 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005545 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005546 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005547 message = "illegal Unicode character";
5548 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005549 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005550 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005551 break;
5552
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 case 'N':
5555 message = "malformed \\N character escape";
5556 if (ucnhash_CAPI == NULL) {
5557 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5559 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005560 if (ucnhash_CAPI == NULL)
5561 goto ucnhashError;
5562 }
5563 if (*s == '{') {
5564 const char *start = s+1;
5565 /* look for the closing brace */
5566 while (*s != '}' && s < end)
5567 s++;
5568 if (s > start && s < end && *s == '}') {
5569 /* found a name. look it up in the unicode database */
5570 message = "unknown Unicode character name";
5571 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005572 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005573 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005574 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005575 goto store;
5576 }
5577 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005578 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005579
5580 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005581 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582 message = "\\ at end of string";
5583 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005584 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005585 }
5586 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005588 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005589 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005590 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005592 continue;
5593
5594 error:
5595 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005596 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005597 errors, &errorHandler,
5598 "unicodeescape", message,
5599 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005600 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005601 goto onError;
5602 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005604#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005605
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005606 Py_XDECREF(errorHandler);
5607 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005608 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005609
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005611 PyErr_SetString(
5612 PyExc_UnicodeError,
5613 "\\N escapes not supported (can't load unicodedata module)"
5614 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005615 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616 Py_XDECREF(errorHandler);
5617 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005618 return NULL;
5619
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 Py_XDECREF(errorHandler);
5623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 return NULL;
5625}
5626
5627/* Return a Unicode-Escape string version of the Unicode object.
5628
5629 If quotes is true, the string is enclosed in u"" or u'' quotes as
5630 appropriate.
5631
5632*/
5633
Alexander Belopolsky40018472011-02-26 01:02:56 +00005634PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005635PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005637 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005638 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005640 int kind;
5641 void *data;
5642 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643
Ezio Melottie7f90372012-10-05 03:33:31 +03005644 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005645 escape.
5646
Ezio Melottie7f90372012-10-05 03:33:31 +03005647 For UCS1 strings it's '\xxx', 4 bytes per source character.
5648 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5649 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005650 */
5651
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005652 if (!PyUnicode_Check(unicode)) {
5653 PyErr_BadArgument();
5654 return NULL;
5655 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005656 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 return NULL;
5658 len = PyUnicode_GET_LENGTH(unicode);
5659 kind = PyUnicode_KIND(unicode);
5660 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005661 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005662 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5663 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5664 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5665 }
5666
5667 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005668 return PyBytes_FromStringAndSize(NULL, 0);
5669
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005670 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005672
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005673 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005675 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 if (repr == NULL)
5678 return NULL;
5679
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005680 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005682 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005683 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005684
Walter Dörwald79e913e2007-05-12 11:08:06 +00005685 /* Escape backslashes */
5686 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 *p++ = '\\';
5688 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005689 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005690 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005691
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005692 /* Map 21-bit characters to '\U00xxxxxx' */
5693 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005694 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005695 *p++ = '\\';
5696 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005697 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5698 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5699 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5700 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5701 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5702 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5703 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5704 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005706 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005707
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005709 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 *p++ = '\\';
5711 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005712 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5713 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5714 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5715 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005717
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005718 /* Map special whitespace to '\t', \n', '\r' */
5719 else if (ch == '\t') {
5720 *p++ = '\\';
5721 *p++ = 't';
5722 }
5723 else if (ch == '\n') {
5724 *p++ = '\\';
5725 *p++ = 'n';
5726 }
5727 else if (ch == '\r') {
5728 *p++ = '\\';
5729 *p++ = 'r';
5730 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005731
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005732 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005733 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005735 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005736 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5737 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005738 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005739
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 /* Copy everything else as-is */
5741 else
5742 *p++ = (char) ch;
5743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005745 assert(p - PyBytes_AS_STRING(repr) > 0);
5746 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5747 return NULL;
5748 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749}
5750
Alexander Belopolsky40018472011-02-26 01:02:56 +00005751PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5753 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 PyObject *result;
5756 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5757 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005759 result = PyUnicode_AsUnicodeEscapeString(tmp);
5760 Py_DECREF(tmp);
5761 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762}
5763
5764/* --- Raw Unicode Escape Codec ------------------------------------------- */
5765
Alexander Belopolsky40018472011-02-26 01:02:56 +00005766PyObject *
5767PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005768 Py_ssize_t size,
5769 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005772 Py_ssize_t startinpos;
5773 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005774 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 const char *end;
5776 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 PyObject *errorHandler = NULL;
5778 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005779
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005780 if (size == 0)
5781 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005782
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 /* Escaped strings will always be longer than the resulting
5784 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 length after conversion to the true value. (But decoding error
5786 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005787 _PyUnicodeWriter_Init(&writer);
5788 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 end = s + size;
5791 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 unsigned char c;
5793 Py_UCS4 x;
5794 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005795 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 /* Non-escape characters are interpreted as Unicode ordinals */
5798 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005799 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005800 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005801 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005803 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 startinpos = s-starts;
5805
5806 /* \u-escapes are only interpreted iff the number of leading
5807 backslashes if odd */
5808 bs = s;
5809 for (;s < end;) {
5810 if (*s != '\\')
5811 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005812 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005813 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005814 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 }
5816 if (((s - bs) & 1) == 0 ||
5817 s >= end ||
5818 (*s != 'u' && *s != 'U')) {
5819 continue;
5820 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005821 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 count = *s=='u' ? 4 : 8;
5823 s++;
5824
5825 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 for (x = 0, i = 0; i < count; ++i, ++s) {
5827 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005828 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005830 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 errors, &errorHandler,
5832 "rawunicodeescape", "truncated \\uXXXX",
5833 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005834 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 goto onError;
5836 goto nextByte;
5837 }
5838 x = (x<<4) & ~0xF;
5839 if (c >= '0' && c <= '9')
5840 x += c - '0';
5841 else if (c >= 'a' && c <= 'f')
5842 x += 10 + c - 'a';
5843 else
5844 x += 10 + c - 'A';
5845 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005846 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005847 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005848 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005849 }
5850 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005851 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005852 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005853 errors, &errorHandler,
5854 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005856 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005858 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 nextByte:
5860 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 Py_XDECREF(errorHandler);
5863 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005864 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005865
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005867 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 Py_XDECREF(errorHandler);
5869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 return NULL;
5871}
5872
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005873
Alexander Belopolsky40018472011-02-26 01:02:56 +00005874PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005877 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 char *p;
5879 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005880 Py_ssize_t expandsize, pos;
5881 int kind;
5882 void *data;
5883 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885 if (!PyUnicode_Check(unicode)) {
5886 PyErr_BadArgument();
5887 return NULL;
5888 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005889 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890 return NULL;
5891 kind = PyUnicode_KIND(unicode);
5892 data = PyUnicode_DATA(unicode);
5893 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005894 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5895 bytes, and 1 byte characters 4. */
5896 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005897
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005898 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005900
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005901 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 if (repr == NULL)
5903 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005905 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005907 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908 for (pos = 0; pos < len; pos++) {
5909 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 /* Map 32-bit characters to '\Uxxxxxxxx' */
5911 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005912 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005913 *p++ = '\\';
5914 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005915 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5916 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5917 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5918 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5919 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5920 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5921 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5922 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005923 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005925 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 *p++ = '\\';
5927 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005928 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5929 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5930 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5931 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 /* Copy everything else as-is */
5934 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 *p++ = (char) ch;
5936 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005937
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005938 assert(p > q);
5939 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005940 return NULL;
5941 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942}
5943
Alexander Belopolsky40018472011-02-26 01:02:56 +00005944PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005945PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5946 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005948 PyObject *result;
5949 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5950 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005951 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5953 Py_DECREF(tmp);
5954 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955}
5956
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005957/* --- Unicode Internal Codec ------------------------------------------- */
5958
Alexander Belopolsky40018472011-02-26 01:02:56 +00005959PyObject *
5960_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005961 Py_ssize_t size,
5962 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005963{
5964 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005965 Py_ssize_t startinpos;
5966 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005967 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005968 const char *end;
5969 const char *reason;
5970 PyObject *errorHandler = NULL;
5971 PyObject *exc = NULL;
5972
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005973 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005974 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005975 1))
5976 return NULL;
5977
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005978 if (size == 0)
5979 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005980
Victor Stinner8f674cc2013-04-17 23:02:17 +02005981 _PyUnicodeWriter_Init(&writer);
5982 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
5983 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02005985 }
5986 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005987
Victor Stinner8f674cc2013-04-17 23:02:17 +02005988 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005989 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005990 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005991 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02005992 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02005993 endinpos = end-starts;
5994 reason = "truncated input";
5995 goto error;
5996 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005997 /* We copy the raw representation one byte at a time because the
5998 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005999 ((char *) &uch)[0] = s[0];
6000 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006001#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006002 ((char *) &uch)[2] = s[2];
6003 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006004#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006005 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006006#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006007 /* We have to sanity check the raw data, otherwise doom looms for
6008 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006009 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006010 endinpos = s - starts + Py_UNICODE_SIZE;
6011 reason = "illegal code point (> 0x10FFFF)";
6012 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006013 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006014#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006015 s += Py_UNICODE_SIZE;
6016#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006017 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006018 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006019 Py_UNICODE uch2;
6020 ((char *) &uch2)[0] = s[0];
6021 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006022 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006023 {
Victor Stinner551ac952011-11-29 22:58:13 +01006024 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006025 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006026 }
6027 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006028#endif
6029
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006030 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006031 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006032 continue;
6033
6034 error:
6035 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006036 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006037 errors, &errorHandler,
6038 "unicode_internal", reason,
6039 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006040 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006041 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006042 }
6043
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006044 Py_XDECREF(errorHandler);
6045 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006046 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006047
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006049 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006050 Py_XDECREF(errorHandler);
6051 Py_XDECREF(exc);
6052 return NULL;
6053}
6054
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055/* --- Latin-1 Codec ------------------------------------------------------ */
6056
Alexander Belopolsky40018472011-02-26 01:02:56 +00006057PyObject *
6058PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006059 Py_ssize_t size,
6060 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006063 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064}
6065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006067static void
6068make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006069 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006070 PyObject *unicode,
6071 Py_ssize_t startpos, Py_ssize_t endpos,
6072 const char *reason)
6073{
6074 if (*exceptionObject == NULL) {
6075 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006076 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006077 encoding, unicode, startpos, endpos, reason);
6078 }
6079 else {
6080 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6081 goto onError;
6082 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6083 goto onError;
6084 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6085 goto onError;
6086 return;
6087 onError:
6088 Py_DECREF(*exceptionObject);
6089 *exceptionObject = NULL;
6090 }
6091}
6092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006094static void
6095raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006096 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006097 PyObject *unicode,
6098 Py_ssize_t startpos, Py_ssize_t endpos,
6099 const char *reason)
6100{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006101 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006102 encoding, unicode, startpos, endpos, reason);
6103 if (*exceptionObject != NULL)
6104 PyCodec_StrictErrors(*exceptionObject);
6105}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106
6107/* error handling callback helper:
6108 build arguments, call the callback and check the arguments,
6109 put the result into newpos and return the replacement string, which
6110 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006111static PyObject *
6112unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006113 PyObject **errorHandler,
6114 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006115 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006116 Py_ssize_t startpos, Py_ssize_t endpos,
6117 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006119 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006120 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 PyObject *restuple;
6122 PyObject *resunicode;
6123
6124 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 }
6129
Benjamin Petersonbac79492012-01-14 13:34:47 -05006130 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006131 return NULL;
6132 len = PyUnicode_GET_LENGTH(unicode);
6133
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006134 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006135 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138
6139 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006144 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 Py_DECREF(restuple);
6146 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006148 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 &resunicode, newpos)) {
6150 Py_DECREF(restuple);
6151 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006153 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6154 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6155 Py_DECREF(restuple);
6156 return NULL;
6157 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006159 *newpos = len + *newpos;
6160 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6162 Py_DECREF(restuple);
6163 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006164 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165 Py_INCREF(resunicode);
6166 Py_DECREF(restuple);
6167 return resunicode;
6168}
6169
Alexander Belopolsky40018472011-02-26 01:02:56 +00006170static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006171unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006172 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006173 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006175 /* input state */
6176 Py_ssize_t pos=0, size;
6177 int kind;
6178 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 /* output object */
6180 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 /* pointer into the output */
6182 char *str;
6183 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006184 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006185 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6186 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006187 PyObject *errorHandler = NULL;
6188 PyObject *exc = NULL;
6189 /* the following variable is used for caching string comparisons
6190 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6191 int known_errorHandler = -1;
6192
Benjamin Petersonbac79492012-01-14 13:34:47 -05006193 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006194 return NULL;
6195 size = PyUnicode_GET_LENGTH(unicode);
6196 kind = PyUnicode_KIND(unicode);
6197 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006198 /* allocate enough for a simple encoding without
6199 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006200 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006201 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006202 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006204 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006205 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006206 ressize = size;
6207
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006208 while (pos < size) {
6209 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006210
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 /* can we encode this? */
6212 if (c<limit) {
6213 /* no overflow check, because we know that the space is enough */
6214 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006215 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006216 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 Py_ssize_t requiredsize;
6219 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006220 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006222 Py_ssize_t collstart = pos;
6223 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006225 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 ++collend;
6227 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6228 if (known_errorHandler==-1) {
6229 if ((errors==NULL) || (!strcmp(errors, "strict")))
6230 known_errorHandler = 1;
6231 else if (!strcmp(errors, "replace"))
6232 known_errorHandler = 2;
6233 else if (!strcmp(errors, "ignore"))
6234 known_errorHandler = 3;
6235 else if (!strcmp(errors, "xmlcharrefreplace"))
6236 known_errorHandler = 4;
6237 else
6238 known_errorHandler = 0;
6239 }
6240 switch (known_errorHandler) {
6241 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006242 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 goto onError;
6244 case 2: /* replace */
6245 while (collstart++<collend)
6246 *str++ = '?'; /* fall through */
6247 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006248 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 break;
6250 case 4: /* xmlcharrefreplace */
6251 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006252 /* determine replacement size */
6253 for (i = collstart, repsize = 0; i < collend; ++i) {
6254 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6255 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006257 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006259 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006261 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006263 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006265 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006267 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006268 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006270 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006272 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 if (requiredsize > ressize) {
6274 if (requiredsize<2*ressize)
6275 requiredsize = 2*ressize;
6276 if (_PyBytes_Resize(&res, requiredsize))
6277 goto onError;
6278 str = PyBytes_AS_STRING(res) + respos;
6279 ressize = requiredsize;
6280 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006281 /* generate replacement */
6282 for (i = collstart; i < collend; ++i) {
6283 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006285 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 break;
6287 default:
6288 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006289 encoding, reason, unicode, &exc,
6290 collstart, collend, &newpos);
6291 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006292 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006294 if (PyBytes_Check(repunicode)) {
6295 /* Directly copy bytes result to output. */
6296 repsize = PyBytes_Size(repunicode);
6297 if (repsize > 1) {
6298 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006299 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006300 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6301 Py_DECREF(repunicode);
6302 goto onError;
6303 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006304 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006305 ressize += repsize-1;
6306 }
6307 memcpy(str, PyBytes_AsString(repunicode), repsize);
6308 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006309 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006310 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006311 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006312 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 /* need more space? (at least enough for what we
6314 have+the replacement+the rest of the string, so
6315 we won't have to check space for encodable characters) */
6316 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006317 repsize = PyUnicode_GET_LENGTH(repunicode);
6318 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 if (requiredsize > ressize) {
6320 if (requiredsize<2*ressize)
6321 requiredsize = 2*ressize;
6322 if (_PyBytes_Resize(&res, requiredsize)) {
6323 Py_DECREF(repunicode);
6324 goto onError;
6325 }
6326 str = PyBytes_AS_STRING(res) + respos;
6327 ressize = requiredsize;
6328 }
6329 /* check if there is anything unencodable in the replacement
6330 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006331 for (i = 0; repsize-->0; ++i, ++str) {
6332 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006334 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006335 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 Py_DECREF(repunicode);
6337 goto onError;
6338 }
6339 *str = (char)c;
6340 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006341 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006342 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006343 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006344 }
6345 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006346 /* Resize if we allocated to much */
6347 size = str - PyBytes_AS_STRING(res);
6348 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006349 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006350 if (_PyBytes_Resize(&res, size) < 0)
6351 goto onError;
6352 }
6353
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354 Py_XDECREF(errorHandler);
6355 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006356 return res;
6357
6358 onError:
6359 Py_XDECREF(res);
6360 Py_XDECREF(errorHandler);
6361 Py_XDECREF(exc);
6362 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363}
6364
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006365/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006366PyObject *
6367PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006368 Py_ssize_t size,
6369 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006371 PyObject *result;
6372 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6373 if (unicode == NULL)
6374 return NULL;
6375 result = unicode_encode_ucs1(unicode, errors, 256);
6376 Py_DECREF(unicode);
6377 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378}
6379
Alexander Belopolsky40018472011-02-26 01:02:56 +00006380PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006381_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382{
6383 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 PyErr_BadArgument();
6385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006387 if (PyUnicode_READY(unicode) == -1)
6388 return NULL;
6389 /* Fast path: if it is a one-byte string, construct
6390 bytes object directly. */
6391 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6392 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6393 PyUnicode_GET_LENGTH(unicode));
6394 /* Non-Latin-1 characters present. Defer to above function to
6395 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006396 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006397}
6398
6399PyObject*
6400PyUnicode_AsLatin1String(PyObject *unicode)
6401{
6402 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403}
6404
6405/* --- 7-bit ASCII Codec -------------------------------------------------- */
6406
Alexander Belopolsky40018472011-02-26 01:02:56 +00006407PyObject *
6408PyUnicode_DecodeASCII(const char *s,
6409 Py_ssize_t size,
6410 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006413 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006414 int kind;
6415 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006416 Py_ssize_t startinpos;
6417 Py_ssize_t endinpos;
6418 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 const char *e;
6420 PyObject *errorHandler = NULL;
6421 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006422
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006424 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006425
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006427 if (size == 1 && (unsigned char)s[0] < 128)
6428 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006429
Victor Stinner8f674cc2013-04-17 23:02:17 +02006430 _PyUnicodeWriter_Init(&writer);
6431 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0)
6432 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006433
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006435 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006436 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006437 writer.pos = outpos;
6438 if (writer.pos == size)
6439 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006440
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006441 s += writer.pos;
6442 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 register unsigned char c = (unsigned char)*s;
6445 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006446 PyUnicode_WRITE(kind, data, writer.pos, c);
6447 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 ++s;
6449 }
6450 else {
6451 startinpos = s-starts;
6452 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006453 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 errors, &errorHandler,
6455 "ascii", "ordinal not in range(128)",
6456 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006457 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006459 kind = writer.kind;
6460 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 Py_XDECREF(errorHandler);
6464 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006465 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006466
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006468 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006469 Py_XDECREF(errorHandler);
6470 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 return NULL;
6472}
6473
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006475PyObject *
6476PyUnicode_EncodeASCII(const Py_UNICODE *p,
6477 Py_ssize_t size,
6478 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006480 PyObject *result;
6481 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6482 if (unicode == NULL)
6483 return NULL;
6484 result = unicode_encode_ucs1(unicode, errors, 128);
6485 Py_DECREF(unicode);
6486 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487}
6488
Alexander Belopolsky40018472011-02-26 01:02:56 +00006489PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006490_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491{
6492 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 PyErr_BadArgument();
6494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006496 if (PyUnicode_READY(unicode) == -1)
6497 return NULL;
6498 /* Fast path: if it is an ASCII-only string, construct bytes object
6499 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006500 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006501 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6502 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006504}
6505
6506PyObject *
6507PyUnicode_AsASCIIString(PyObject *unicode)
6508{
6509 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510}
6511
Victor Stinner99b95382011-07-04 14:23:54 +02006512#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006513
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006514/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006515
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006516#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006517#define NEED_RETRY
6518#endif
6519
Victor Stinner3a50e702011-10-18 21:21:00 +02006520#ifndef WC_ERR_INVALID_CHARS
6521# define WC_ERR_INVALID_CHARS 0x0080
6522#endif
6523
6524static char*
6525code_page_name(UINT code_page, PyObject **obj)
6526{
6527 *obj = NULL;
6528 if (code_page == CP_ACP)
6529 return "mbcs";
6530 if (code_page == CP_UTF7)
6531 return "CP_UTF7";
6532 if (code_page == CP_UTF8)
6533 return "CP_UTF8";
6534
6535 *obj = PyBytes_FromFormat("cp%u", code_page);
6536 if (*obj == NULL)
6537 return NULL;
6538 return PyBytes_AS_STRING(*obj);
6539}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006540
Alexander Belopolsky40018472011-02-26 01:02:56 +00006541static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006542is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006543{
6544 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006545 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006546
Victor Stinner3a50e702011-10-18 21:21:00 +02006547 if (!IsDBCSLeadByteEx(code_page, *curr))
6548 return 0;
6549
6550 prev = CharPrevExA(code_page, s, curr, 0);
6551 if (prev == curr)
6552 return 1;
6553 /* FIXME: This code is limited to "true" double-byte encodings,
6554 as it assumes an incomplete character consists of a single
6555 byte. */
6556 if (curr - prev == 2)
6557 return 1;
6558 if (!IsDBCSLeadByteEx(code_page, *prev))
6559 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006560 return 0;
6561}
6562
Victor Stinner3a50e702011-10-18 21:21:00 +02006563static DWORD
6564decode_code_page_flags(UINT code_page)
6565{
6566 if (code_page == CP_UTF7) {
6567 /* The CP_UTF7 decoder only supports flags=0 */
6568 return 0;
6569 }
6570 else
6571 return MB_ERR_INVALID_CHARS;
6572}
6573
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006574/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006575 * Decode a byte string from a Windows code page into unicode object in strict
6576 * mode.
6577 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006578 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6579 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006580 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006581static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006582decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006583 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006584 const char *in,
6585 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006586{
Victor Stinner3a50e702011-10-18 21:21:00 +02006587 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006588 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006589 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006590
6591 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006592 assert(insize > 0);
6593 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6594 if (outsize <= 0)
6595 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006596
6597 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006599 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006600 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 if (*v == NULL)
6602 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006603 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006604 }
6605 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006607 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006608 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006610 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006611 }
6612
6613 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006614 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6615 if (outsize <= 0)
6616 goto error;
6617 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006618
Victor Stinner3a50e702011-10-18 21:21:00 +02006619error:
6620 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6621 return -2;
6622 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006623 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006624}
6625
Victor Stinner3a50e702011-10-18 21:21:00 +02006626/*
6627 * Decode a byte string from a code page into unicode object with an error
6628 * handler.
6629 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006630 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006631 * UnicodeDecodeError exception and returns -1 on error.
6632 */
6633static int
6634decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006635 PyObject **v,
6636 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006637 const char *errors)
6638{
6639 const char *startin = in;
6640 const char *endin = in + size;
6641 const DWORD flags = decode_code_page_flags(code_page);
6642 /* Ideally, we should get reason from FormatMessage. This is the Windows
6643 2000 English version of the message. */
6644 const char *reason = "No mapping for the Unicode character exists "
6645 "in the target code page.";
6646 /* each step cannot decode more than 1 character, but a character can be
6647 represented as a surrogate pair */
6648 wchar_t buffer[2], *startout, *out;
6649 int insize, outsize;
6650 PyObject *errorHandler = NULL;
6651 PyObject *exc = NULL;
6652 PyObject *encoding_obj = NULL;
6653 char *encoding;
6654 DWORD err;
6655 int ret = -1;
6656
6657 assert(size > 0);
6658
6659 encoding = code_page_name(code_page, &encoding_obj);
6660 if (encoding == NULL)
6661 return -1;
6662
6663 if (errors == NULL || strcmp(errors, "strict") == 0) {
6664 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6665 UnicodeDecodeError. */
6666 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6667 if (exc != NULL) {
6668 PyCodec_StrictErrors(exc);
6669 Py_CLEAR(exc);
6670 }
6671 goto error;
6672 }
6673
6674 if (*v == NULL) {
6675 /* Create unicode object */
6676 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6677 PyErr_NoMemory();
6678 goto error;
6679 }
Victor Stinnerab595942011-12-17 04:59:06 +01006680 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006681 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006682 if (*v == NULL)
6683 goto error;
6684 startout = PyUnicode_AS_UNICODE(*v);
6685 }
6686 else {
6687 /* Extend unicode object */
6688 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6689 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6690 PyErr_NoMemory();
6691 goto error;
6692 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006693 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006694 goto error;
6695 startout = PyUnicode_AS_UNICODE(*v) + n;
6696 }
6697
6698 /* Decode the byte string character per character */
6699 out = startout;
6700 while (in < endin)
6701 {
6702 /* Decode a character */
6703 insize = 1;
6704 do
6705 {
6706 outsize = MultiByteToWideChar(code_page, flags,
6707 in, insize,
6708 buffer, Py_ARRAY_LENGTH(buffer));
6709 if (outsize > 0)
6710 break;
6711 err = GetLastError();
6712 if (err != ERROR_NO_UNICODE_TRANSLATION
6713 && err != ERROR_INSUFFICIENT_BUFFER)
6714 {
6715 PyErr_SetFromWindowsErr(0);
6716 goto error;
6717 }
6718 insize++;
6719 }
6720 /* 4=maximum length of a UTF-8 sequence */
6721 while (insize <= 4 && (in + insize) <= endin);
6722
6723 if (outsize <= 0) {
6724 Py_ssize_t startinpos, endinpos, outpos;
6725
6726 startinpos = in - startin;
6727 endinpos = startinpos + 1;
6728 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006729 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006730 errors, &errorHandler,
6731 encoding, reason,
6732 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006733 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006734 {
6735 goto error;
6736 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006737 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006738 }
6739 else {
6740 in += insize;
6741 memcpy(out, buffer, outsize * sizeof(wchar_t));
6742 out += outsize;
6743 }
6744 }
6745
6746 /* write a NUL character at the end */
6747 *out = 0;
6748
6749 /* Extend unicode object */
6750 outsize = out - startout;
6751 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006752 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006753 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006754 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006755
6756error:
6757 Py_XDECREF(encoding_obj);
6758 Py_XDECREF(errorHandler);
6759 Py_XDECREF(exc);
6760 return ret;
6761}
6762
Victor Stinner3a50e702011-10-18 21:21:00 +02006763static PyObject *
6764decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006765 const char *s, Py_ssize_t size,
6766 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006767{
Victor Stinner76a31a62011-11-04 00:05:13 +01006768 PyObject *v = NULL;
6769 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006770
Victor Stinner3a50e702011-10-18 21:21:00 +02006771 if (code_page < 0) {
6772 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6773 return NULL;
6774 }
6775
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006776 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006778
Victor Stinner76a31a62011-11-04 00:05:13 +01006779 do
6780 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006781#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006782 if (size > INT_MAX) {
6783 chunk_size = INT_MAX;
6784 final = 0;
6785 done = 0;
6786 }
6787 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006788#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006789 {
6790 chunk_size = (int)size;
6791 final = (consumed == NULL);
6792 done = 1;
6793 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006794
Victor Stinner76a31a62011-11-04 00:05:13 +01006795 /* Skip trailing lead-byte unless 'final' is set */
6796 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6797 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798
Victor Stinner76a31a62011-11-04 00:05:13 +01006799 if (chunk_size == 0 && done) {
6800 if (v != NULL)
6801 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006802 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006803 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006804
Victor Stinner76a31a62011-11-04 00:05:13 +01006805
6806 converted = decode_code_page_strict(code_page, &v,
6807 s, chunk_size);
6808 if (converted == -2)
6809 converted = decode_code_page_errors(code_page, &v,
6810 s, chunk_size,
6811 errors);
6812 assert(converted != 0);
6813
6814 if (converted < 0) {
6815 Py_XDECREF(v);
6816 return NULL;
6817 }
6818
6819 if (consumed)
6820 *consumed += converted;
6821
6822 s += converted;
6823 size -= converted;
6824 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006825
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006826 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006827}
6828
Alexander Belopolsky40018472011-02-26 01:02:56 +00006829PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006830PyUnicode_DecodeCodePageStateful(int code_page,
6831 const char *s,
6832 Py_ssize_t size,
6833 const char *errors,
6834 Py_ssize_t *consumed)
6835{
6836 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6837}
6838
6839PyObject *
6840PyUnicode_DecodeMBCSStateful(const char *s,
6841 Py_ssize_t size,
6842 const char *errors,
6843 Py_ssize_t *consumed)
6844{
6845 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6846}
6847
6848PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006849PyUnicode_DecodeMBCS(const char *s,
6850 Py_ssize_t size,
6851 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006852{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6854}
6855
Victor Stinner3a50e702011-10-18 21:21:00 +02006856static DWORD
6857encode_code_page_flags(UINT code_page, const char *errors)
6858{
6859 if (code_page == CP_UTF8) {
6860 if (winver.dwMajorVersion >= 6)
6861 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6862 and later */
6863 return WC_ERR_INVALID_CHARS;
6864 else
6865 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6866 return 0;
6867 }
6868 else if (code_page == CP_UTF7) {
6869 /* CP_UTF7 only supports flags=0 */
6870 return 0;
6871 }
6872 else {
6873 if (errors != NULL && strcmp(errors, "replace") == 0)
6874 return 0;
6875 else
6876 return WC_NO_BEST_FIT_CHARS;
6877 }
6878}
6879
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006881 * Encode a Unicode string to a Windows code page into a byte string in strict
6882 * mode.
6883 *
6884 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006885 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006887static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006888encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006889 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006890 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006891{
Victor Stinner554f3f02010-06-16 23:33:54 +00006892 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006893 BOOL *pusedDefaultChar = &usedDefaultChar;
6894 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006895 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006896 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006897 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006898 const DWORD flags = encode_code_page_flags(code_page, NULL);
6899 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006900 /* Create a substring so that we can get the UTF-16 representation
6901 of just the slice under consideration. */
6902 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903
Martin v. Löwis3d325192011-11-04 18:23:06 +01006904 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006905
Victor Stinner3a50e702011-10-18 21:21:00 +02006906 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006907 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006909 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006910
Victor Stinner2fc507f2011-11-04 20:06:39 +01006911 substring = PyUnicode_Substring(unicode, offset, offset+len);
6912 if (substring == NULL)
6913 return -1;
6914 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6915 if (p == NULL) {
6916 Py_DECREF(substring);
6917 return -1;
6918 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006919
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006920 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 outsize = WideCharToMultiByte(code_page, flags,
6922 p, size,
6923 NULL, 0,
6924 NULL, pusedDefaultChar);
6925 if (outsize <= 0)
6926 goto error;
6927 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006928 if (pusedDefaultChar && *pusedDefaultChar) {
6929 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006931 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006932
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006935 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006936 if (*outbytes == NULL) {
6937 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006939 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006941 }
6942 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006944 const Py_ssize_t n = PyBytes_Size(*outbytes);
6945 if (outsize > PY_SSIZE_T_MAX - n) {
6946 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006947 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006949 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006950 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6951 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006952 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006953 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006954 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955 }
6956
6957 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006958 outsize = WideCharToMultiByte(code_page, flags,
6959 p, size,
6960 out, outsize,
6961 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006962 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006963 if (outsize <= 0)
6964 goto error;
6965 if (pusedDefaultChar && *pusedDefaultChar)
6966 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006967 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006968
Victor Stinner3a50e702011-10-18 21:21:00 +02006969error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006970 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6972 return -2;
6973 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006974 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006975}
6976
Victor Stinner3a50e702011-10-18 21:21:00 +02006977/*
6978 * Encode a Unicode string to a Windows code page into a byte string using a
6979 * error handler.
6980 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006981 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02006982 * -1 on other error.
6983 */
6984static int
6985encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01006986 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006987 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006988{
Victor Stinner3a50e702011-10-18 21:21:00 +02006989 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006990 Py_ssize_t pos = unicode_offset;
6991 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 /* Ideally, we should get reason from FormatMessage. This is the Windows
6993 2000 English version of the message. */
6994 const char *reason = "invalid character";
6995 /* 4=maximum length of a UTF-8 sequence */
6996 char buffer[4];
6997 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
6998 Py_ssize_t outsize;
6999 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 PyObject *errorHandler = NULL;
7001 PyObject *exc = NULL;
7002 PyObject *encoding_obj = NULL;
7003 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007004 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 PyObject *rep;
7006 int ret = -1;
7007
7008 assert(insize > 0);
7009
7010 encoding = code_page_name(code_page, &encoding_obj);
7011 if (encoding == NULL)
7012 return -1;
7013
7014 if (errors == NULL || strcmp(errors, "strict") == 0) {
7015 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7016 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007017 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 if (exc != NULL) {
7019 PyCodec_StrictErrors(exc);
7020 Py_DECREF(exc);
7021 }
7022 Py_XDECREF(encoding_obj);
7023 return -1;
7024 }
7025
7026 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7027 pusedDefaultChar = &usedDefaultChar;
7028 else
7029 pusedDefaultChar = NULL;
7030
7031 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7032 PyErr_NoMemory();
7033 goto error;
7034 }
7035 outsize = insize * Py_ARRAY_LENGTH(buffer);
7036
7037 if (*outbytes == NULL) {
7038 /* Create string object */
7039 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7040 if (*outbytes == NULL)
7041 goto error;
7042 out = PyBytes_AS_STRING(*outbytes);
7043 }
7044 else {
7045 /* Extend string object */
7046 Py_ssize_t n = PyBytes_Size(*outbytes);
7047 if (n > PY_SSIZE_T_MAX - outsize) {
7048 PyErr_NoMemory();
7049 goto error;
7050 }
7051 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7052 goto error;
7053 out = PyBytes_AS_STRING(*outbytes) + n;
7054 }
7055
7056 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007057 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007058 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007059 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7060 wchar_t chars[2];
7061 int charsize;
7062 if (ch < 0x10000) {
7063 chars[0] = (wchar_t)ch;
7064 charsize = 1;
7065 }
7066 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007067 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7068 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007069 charsize = 2;
7070 }
7071
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007073 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 buffer, Py_ARRAY_LENGTH(buffer),
7075 NULL, pusedDefaultChar);
7076 if (outsize > 0) {
7077 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7078 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007079 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007080 memcpy(out, buffer, outsize);
7081 out += outsize;
7082 continue;
7083 }
7084 }
7085 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7086 PyErr_SetFromWindowsErr(0);
7087 goto error;
7088 }
7089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 rep = unicode_encode_call_errorhandler(
7091 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007092 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007093 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 if (rep == NULL)
7095 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007096 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007097
7098 if (PyBytes_Check(rep)) {
7099 outsize = PyBytes_GET_SIZE(rep);
7100 if (outsize != 1) {
7101 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7102 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7103 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7104 Py_DECREF(rep);
7105 goto error;
7106 }
7107 out = PyBytes_AS_STRING(*outbytes) + offset;
7108 }
7109 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7110 out += outsize;
7111 }
7112 else {
7113 Py_ssize_t i;
7114 enum PyUnicode_Kind kind;
7115 void *data;
7116
Benjamin Petersonbac79492012-01-14 13:34:47 -05007117 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 Py_DECREF(rep);
7119 goto error;
7120 }
7121
7122 outsize = PyUnicode_GET_LENGTH(rep);
7123 if (outsize != 1) {
7124 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7125 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7126 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7127 Py_DECREF(rep);
7128 goto error;
7129 }
7130 out = PyBytes_AS_STRING(*outbytes) + offset;
7131 }
7132 kind = PyUnicode_KIND(rep);
7133 data = PyUnicode_DATA(rep);
7134 for (i=0; i < outsize; i++) {
7135 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7136 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007137 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007138 encoding, unicode,
7139 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 "unable to encode error handler result to ASCII");
7141 Py_DECREF(rep);
7142 goto error;
7143 }
7144 *out = (unsigned char)ch;
7145 out++;
7146 }
7147 }
7148 Py_DECREF(rep);
7149 }
7150 /* write a NUL byte */
7151 *out = 0;
7152 outsize = out - PyBytes_AS_STRING(*outbytes);
7153 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7154 if (_PyBytes_Resize(outbytes, outsize) < 0)
7155 goto error;
7156 ret = 0;
7157
7158error:
7159 Py_XDECREF(encoding_obj);
7160 Py_XDECREF(errorHandler);
7161 Py_XDECREF(exc);
7162 return ret;
7163}
7164
Victor Stinner3a50e702011-10-18 21:21:00 +02007165static PyObject *
7166encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007167 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 const char *errors)
7169{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007170 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007172 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007173 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007174
Benjamin Petersonbac79492012-01-14 13:34:47 -05007175 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007176 return NULL;
7177 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007178
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 if (code_page < 0) {
7180 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7181 return NULL;
7182 }
7183
Martin v. Löwis3d325192011-11-04 18:23:06 +01007184 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007185 return PyBytes_FromStringAndSize(NULL, 0);
7186
Victor Stinner7581cef2011-11-03 22:32:33 +01007187 offset = 0;
7188 do
7189 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007190#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007191 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007192 chunks. */
7193 if (len > INT_MAX/2) {
7194 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007195 done = 0;
7196 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007197 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007198#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007199 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007200 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007201 done = 1;
7202 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007203
Victor Stinner76a31a62011-11-04 00:05:13 +01007204 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007205 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007206 errors);
7207 if (ret == -2)
7208 ret = encode_code_page_errors(code_page, &outbytes,
7209 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007210 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007211 if (ret < 0) {
7212 Py_XDECREF(outbytes);
7213 return NULL;
7214 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007215
Victor Stinner7581cef2011-11-03 22:32:33 +01007216 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007217 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007218 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007219
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 return outbytes;
7221}
7222
7223PyObject *
7224PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7225 Py_ssize_t size,
7226 const char *errors)
7227{
Victor Stinner7581cef2011-11-03 22:32:33 +01007228 PyObject *unicode, *res;
7229 unicode = PyUnicode_FromUnicode(p, size);
7230 if (unicode == NULL)
7231 return NULL;
7232 res = encode_code_page(CP_ACP, unicode, errors);
7233 Py_DECREF(unicode);
7234 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007235}
7236
7237PyObject *
7238PyUnicode_EncodeCodePage(int code_page,
7239 PyObject *unicode,
7240 const char *errors)
7241{
Victor Stinner7581cef2011-11-03 22:32:33 +01007242 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007243}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007244
Alexander Belopolsky40018472011-02-26 01:02:56 +00007245PyObject *
7246PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007247{
7248 if (!PyUnicode_Check(unicode)) {
7249 PyErr_BadArgument();
7250 return NULL;
7251 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007252 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007253}
7254
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255#undef NEED_RETRY
7256
Victor Stinner99b95382011-07-04 14:23:54 +02007257#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007258
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259/* --- Character Mapping Codec -------------------------------------------- */
7260
Alexander Belopolsky40018472011-02-26 01:02:56 +00007261PyObject *
7262PyUnicode_DecodeCharmap(const char *s,
7263 Py_ssize_t size,
7264 PyObject *mapping,
7265 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007267 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007268 Py_ssize_t startinpos;
7269 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007270 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007271 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007272 PyObject *errorHandler = NULL;
7273 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007274
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 /* Default to Latin-1 */
7276 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007280 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007281 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007282 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007285 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007286 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007287 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007288 enum PyUnicode_Kind mapkind;
7289 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007290 Py_UCS4 x;
Victor Stinner03c3e352013-04-09 21:53:09 +02007291 unsigned char ch;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007292
Benjamin Petersonbac79492012-01-14 13:34:47 -05007293 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007294 return NULL;
7295
7296 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007297 mapdata = PyUnicode_DATA(mapping);
7298 mapkind = PyUnicode_KIND(mapping);
Victor Stinner03c3e352013-04-09 21:53:09 +02007299
7300 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7301 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7302 * is disabled in encoding aliases, latin1 is preferred because
7303 * its implementation is faster. */
7304 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7305 Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
7306 Py_UCS4 maxchar = writer.maxchar;
7307
7308 assert (writer.kind == PyUnicode_1BYTE_KIND);
7309 while (s < e) {
7310 ch = *s;
7311 x = mapdata_ucs1[ch];
7312 if (x > maxchar) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007313 if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1)
Victor Stinner03c3e352013-04-09 21:53:09 +02007314 goto onError;
7315 maxchar = writer.maxchar;
7316 outdata = (Py_UCS1 *)writer.data;
7317 }
7318 outdata[writer.pos] = x;
7319 writer.pos++;
7320 ++s;
7321 }
7322 }
7323
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007325 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007326 enum PyUnicode_Kind outkind = writer.kind;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007327 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007328 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007329 Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007330 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007331 while (s < e) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007332 ch = *s;
7333 x = mapdata_ucs2[ch];
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007334 if (x > maxchar)
7335 goto Error;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007336 outdata[writer.pos] = x;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007337 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007338 ++s;
7339 }
7340 break;
7341 }
7342 else if (outkind == PyUnicode_2BYTE_KIND) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007343 Py_UCS2 *outdata = (Py_UCS2 *)writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007344 while (s < e) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007345 ch = *s;
7346 x = mapdata_ucs2[ch];
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007347 if (x == 0xFFFE)
7348 goto Error;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007349 outdata[writer.pos] = x;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007350 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007351 ++s;
7352 }
7353 break;
7354 }
7355 }
7356 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007359 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007360 else
7361 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007362Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007363 if (x == 0xfffe)
7364 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 startinpos = s-starts;
7367 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007368 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 errors, &errorHandler,
7370 "charmap", "character maps to <undefined>",
7371 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007372 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 goto onError;
7374 }
7375 continue;
7376 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007377
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02007378 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007379 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007381 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007382 }
7383 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 while (s < e) {
7385 unsigned char ch = *s;
7386 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007387
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7389 w = PyLong_FromLong((long)ch);
7390 if (w == NULL)
7391 goto onError;
7392 x = PyObject_GetItem(mapping, w);
7393 Py_DECREF(w);
7394 if (x == NULL) {
7395 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7396 /* No mapping found means: mapping is undefined. */
7397 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007398 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 } else
7400 goto onError;
7401 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007402
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007404 if (x == Py_None)
7405 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 if (PyLong_Check(x)) {
7407 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007408 if (value == 0xFFFE)
7409 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007410 if (value < 0 || value > MAX_UNICODE) {
7411 PyErr_Format(PyExc_TypeError,
7412 "character mapping must be in range(0x%lx)",
7413 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 Py_DECREF(x);
7415 goto onError;
7416 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007417
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02007418 if (_PyUnicodeWriter_WriteCharInline(&writer, value) < 0) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007419 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007420 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007421 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 else if (PyUnicode_Check(x)) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007424 if (PyUnicode_READY(x) == -1) {
7425 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007426 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007427 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007428 if (PyUnicode_GET_LENGTH(x) == 1) {
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007429 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007430 if (value == 0xFFFE)
7431 goto Undefined;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02007432 if (_PyUnicodeWriter_WriteCharInline(&writer, value) < 0) {
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007433 Py_DECREF(x);
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007434 goto onError;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007435 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007436 }
7437 else {
7438 writer.overallocate = 1;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007439 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007440 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007441 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007442 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007443 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 }
7445 else {
7446 /* wrong return value */
7447 PyErr_SetString(PyExc_TypeError,
7448 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007449 Py_DECREF(x);
7450 goto onError;
7451 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 Py_DECREF(x);
7453 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007454 continue;
7455Undefined:
7456 /* undefined mapping */
7457 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007458 startinpos = s-starts;
7459 endinpos = startinpos+1;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007460 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007461 errors, &errorHandler,
7462 "charmap", "character maps to <undefined>",
7463 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007464 &writer)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007465 goto onError;
7466 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007469 Py_XDECREF(errorHandler);
7470 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007471 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007472
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007474 Py_XDECREF(errorHandler);
7475 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007476 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477 return NULL;
7478}
7479
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007480/* Charmap encoding: the lookup table */
7481
Alexander Belopolsky40018472011-02-26 01:02:56 +00007482struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 PyObject_HEAD
7484 unsigned char level1[32];
7485 int count2, count3;
7486 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007487};
7488
7489static PyObject*
7490encoding_map_size(PyObject *obj, PyObject* args)
7491{
7492 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007493 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007495}
7496
7497static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007498 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 PyDoc_STR("Return the size (in bytes) of this object") },
7500 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007501};
7502
7503static void
7504encoding_map_dealloc(PyObject* o)
7505{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007506 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007507}
7508
7509static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007510 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 "EncodingMap", /*tp_name*/
7512 sizeof(struct encoding_map), /*tp_basicsize*/
7513 0, /*tp_itemsize*/
7514 /* methods */
7515 encoding_map_dealloc, /*tp_dealloc*/
7516 0, /*tp_print*/
7517 0, /*tp_getattr*/
7518 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007519 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 0, /*tp_repr*/
7521 0, /*tp_as_number*/
7522 0, /*tp_as_sequence*/
7523 0, /*tp_as_mapping*/
7524 0, /*tp_hash*/
7525 0, /*tp_call*/
7526 0, /*tp_str*/
7527 0, /*tp_getattro*/
7528 0, /*tp_setattro*/
7529 0, /*tp_as_buffer*/
7530 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7531 0, /*tp_doc*/
7532 0, /*tp_traverse*/
7533 0, /*tp_clear*/
7534 0, /*tp_richcompare*/
7535 0, /*tp_weaklistoffset*/
7536 0, /*tp_iter*/
7537 0, /*tp_iternext*/
7538 encoding_map_methods, /*tp_methods*/
7539 0, /*tp_members*/
7540 0, /*tp_getset*/
7541 0, /*tp_base*/
7542 0, /*tp_dict*/
7543 0, /*tp_descr_get*/
7544 0, /*tp_descr_set*/
7545 0, /*tp_dictoffset*/
7546 0, /*tp_init*/
7547 0, /*tp_alloc*/
7548 0, /*tp_new*/
7549 0, /*tp_free*/
7550 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007551};
7552
7553PyObject*
7554PyUnicode_BuildEncodingMap(PyObject* string)
7555{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007556 PyObject *result;
7557 struct encoding_map *mresult;
7558 int i;
7559 int need_dict = 0;
7560 unsigned char level1[32];
7561 unsigned char level2[512];
7562 unsigned char *mlevel1, *mlevel2, *mlevel3;
7563 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007564 int kind;
7565 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007566 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007568
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007569 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007570 PyErr_BadArgument();
7571 return NULL;
7572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007573 kind = PyUnicode_KIND(string);
7574 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007575 length = PyUnicode_GET_LENGTH(string);
7576 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007577 memset(level1, 0xFF, sizeof level1);
7578 memset(level2, 0xFF, sizeof level2);
7579
7580 /* If there isn't a one-to-one mapping of NULL to \0,
7581 or if there are non-BMP characters, we need to use
7582 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007583 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007584 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007585 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007586 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007587 ch = PyUnicode_READ(kind, data, i);
7588 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007589 need_dict = 1;
7590 break;
7591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007592 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007593 /* unmapped character */
7594 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007595 l1 = ch >> 11;
7596 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007597 if (level1[l1] == 0xFF)
7598 level1[l1] = count2++;
7599 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007600 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007601 }
7602
7603 if (count2 >= 0xFF || count3 >= 0xFF)
7604 need_dict = 1;
7605
7606 if (need_dict) {
7607 PyObject *result = PyDict_New();
7608 PyObject *key, *value;
7609 if (!result)
7610 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007611 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007612 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007613 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007614 if (!key || !value)
7615 goto failed1;
7616 if (PyDict_SetItem(result, key, value) == -1)
7617 goto failed1;
7618 Py_DECREF(key);
7619 Py_DECREF(value);
7620 }
7621 return result;
7622 failed1:
7623 Py_XDECREF(key);
7624 Py_XDECREF(value);
7625 Py_DECREF(result);
7626 return NULL;
7627 }
7628
7629 /* Create a three-level trie */
7630 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7631 16*count2 + 128*count3 - 1);
7632 if (!result)
7633 return PyErr_NoMemory();
7634 PyObject_Init(result, &EncodingMapType);
7635 mresult = (struct encoding_map*)result;
7636 mresult->count2 = count2;
7637 mresult->count3 = count3;
7638 mlevel1 = mresult->level1;
7639 mlevel2 = mresult->level23;
7640 mlevel3 = mresult->level23 + 16*count2;
7641 memcpy(mlevel1, level1, 32);
7642 memset(mlevel2, 0xFF, 16*count2);
7643 memset(mlevel3, 0, 128*count3);
7644 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007645 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007646 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007647 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7648 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007649 /* unmapped character */
7650 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007651 o1 = ch>>11;
7652 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007653 i2 = 16*mlevel1[o1] + o2;
7654 if (mlevel2[i2] == 0xFF)
7655 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007656 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007657 i3 = 128*mlevel2[i2] + o3;
7658 mlevel3[i3] = i;
7659 }
7660 return result;
7661}
7662
7663static int
Victor Stinner22168992011-11-20 17:09:18 +01007664encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007665{
7666 struct encoding_map *map = (struct encoding_map*)mapping;
7667 int l1 = c>>11;
7668 int l2 = (c>>7) & 0xF;
7669 int l3 = c & 0x7F;
7670 int i;
7671
Victor Stinner22168992011-11-20 17:09:18 +01007672 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007674 if (c == 0)
7675 return 0;
7676 /* level 1*/
7677 i = map->level1[l1];
7678 if (i == 0xFF) {
7679 return -1;
7680 }
7681 /* level 2*/
7682 i = map->level23[16*i+l2];
7683 if (i == 0xFF) {
7684 return -1;
7685 }
7686 /* level 3 */
7687 i = map->level23[16*map->count2 + 128*i + l3];
7688 if (i == 0) {
7689 return -1;
7690 }
7691 return i;
7692}
7693
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694/* Lookup the character ch in the mapping. If the character
7695 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007696 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007697static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007698charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699{
Christian Heimes217cfd12007-12-02 14:31:20 +00007700 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007701 PyObject *x;
7702
7703 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 x = PyObject_GetItem(mapping, w);
7706 Py_DECREF(w);
7707 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7709 /* No mapping found means: mapping is undefined. */
7710 PyErr_Clear();
7711 x = Py_None;
7712 Py_INCREF(x);
7713 return x;
7714 } else
7715 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007717 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007719 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 long value = PyLong_AS_LONG(x);
7721 if (value < 0 || value > 255) {
7722 PyErr_SetString(PyExc_TypeError,
7723 "character mapping must be in range(256)");
7724 Py_DECREF(x);
7725 return NULL;
7726 }
7727 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007729 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 /* wrong return value */
7733 PyErr_Format(PyExc_TypeError,
7734 "character mapping must return integer, bytes or None, not %.400s",
7735 x->ob_type->tp_name);
7736 Py_DECREF(x);
7737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 }
7739}
7740
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007741static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007742charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7745 /* exponentially overallocate to minimize reallocations */
7746 if (requiredsize < 2*outsize)
7747 requiredsize = 2*outsize;
7748 if (_PyBytes_Resize(outobj, requiredsize))
7749 return -1;
7750 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007751}
7752
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement bytes were written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007756/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007757 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007758 space is available. Return a new reference to the object that
7759 was put in the output buffer, or Py_None, if the mapping was undefined
7760 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007761 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007762static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007763charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007764 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007765{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766 PyObject *rep;
7767 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007768 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007769
Christian Heimes90aa7642007-12-19 02:45:37 +00007770 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007771 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 if (res == -1)
7774 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 if (outsize<requiredsize)
7776 if (charmapencode_resize(outobj, outpos, requiredsize))
7777 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007778 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 outstart[(*outpos)++] = (char)res;
7780 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007781 }
7782
7783 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007784 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007786 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 Py_DECREF(rep);
7788 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007789 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 if (PyLong_Check(rep)) {
7791 Py_ssize_t requiredsize = *outpos+1;
7792 if (outsize<requiredsize)
7793 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7794 Py_DECREF(rep);
7795 return enc_EXCEPTION;
7796 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007797 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007799 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 else {
7801 const char *repchars = PyBytes_AS_STRING(rep);
7802 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7803 Py_ssize_t requiredsize = *outpos+repsize;
7804 if (outsize<requiredsize)
7805 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7806 Py_DECREF(rep);
7807 return enc_EXCEPTION;
7808 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007809 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 memcpy(outstart + *outpos, repchars, repsize);
7811 *outpos += repsize;
7812 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007813 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007814 Py_DECREF(rep);
7815 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816}
7817
7818/* handle an error in PyUnicode_EncodeCharmap
7819 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007820static int
7821charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007822 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007824 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007825 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007826{
7827 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007828 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007829 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007830 enum PyUnicode_Kind kind;
7831 void *data;
7832 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007833 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007834 Py_ssize_t collstartpos = *inpos;
7835 Py_ssize_t collendpos = *inpos+1;
7836 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007837 char *encoding = "charmap";
7838 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007839 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007840 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007841 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007842
Benjamin Petersonbac79492012-01-14 13:34:47 -05007843 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007844 return -1;
7845 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007846 /* find all unencodable characters */
7847 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007849 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007850 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007851 val = encoding_map_lookup(ch, mapping);
7852 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 break;
7854 ++collendpos;
7855 continue;
7856 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007858 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7859 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 if (rep==NULL)
7861 return -1;
7862 else if (rep!=Py_None) {
7863 Py_DECREF(rep);
7864 break;
7865 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007866 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868 }
7869 /* cache callback name lookup
7870 * (if not done yet, i.e. it's the first error) */
7871 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 if ((errors==NULL) || (!strcmp(errors, "strict")))
7873 *known_errorHandler = 1;
7874 else if (!strcmp(errors, "replace"))
7875 *known_errorHandler = 2;
7876 else if (!strcmp(errors, "ignore"))
7877 *known_errorHandler = 3;
7878 else if (!strcmp(errors, "xmlcharrefreplace"))
7879 *known_errorHandler = 4;
7880 else
7881 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007882 }
7883 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007884 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007885 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007886 return -1;
7887 case 2: /* replace */
7888 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 x = charmapencode_output('?', mapping, res, respos);
7890 if (x==enc_EXCEPTION) {
7891 return -1;
7892 }
7893 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007894 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 return -1;
7896 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007897 }
7898 /* fall through */
7899 case 3: /* ignore */
7900 *inpos = collendpos;
7901 break;
7902 case 4: /* xmlcharrefreplace */
7903 /* generate replacement (temporarily (mis)uses p) */
7904 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 char buffer[2+29+1+1];
7906 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007907 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 for (cp = buffer; *cp; ++cp) {
7909 x = charmapencode_output(*cp, mapping, res, respos);
7910 if (x==enc_EXCEPTION)
7911 return -1;
7912 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007913 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 return -1;
7915 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007916 }
7917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 *inpos = collendpos;
7919 break;
7920 default:
7921 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007922 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007924 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007926 if (PyBytes_Check(repunicode)) {
7927 /* Directly copy bytes result to output. */
7928 Py_ssize_t outsize = PyBytes_Size(*res);
7929 Py_ssize_t requiredsize;
7930 repsize = PyBytes_Size(repunicode);
7931 requiredsize = *respos + repsize;
7932 if (requiredsize > outsize)
7933 /* Make room for all additional bytes. */
7934 if (charmapencode_resize(res, respos, requiredsize)) {
7935 Py_DECREF(repunicode);
7936 return -1;
7937 }
7938 memcpy(PyBytes_AsString(*res) + *respos,
7939 PyBytes_AsString(repunicode), repsize);
7940 *respos += repsize;
7941 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007942 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007943 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007944 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007945 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007946 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007947 Py_DECREF(repunicode);
7948 return -1;
7949 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007950 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007951 data = PyUnicode_DATA(repunicode);
7952 kind = PyUnicode_KIND(repunicode);
7953 for (index = 0; index < repsize; index++) {
7954 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7955 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007957 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 return -1;
7959 }
7960 else if (x==enc_FAILED) {
7961 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007962 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 return -1;
7964 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007965 }
7966 *inpos = newpos;
7967 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007968 }
7969 return 0;
7970}
7971
Alexander Belopolsky40018472011-02-26 01:02:56 +00007972PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007973_PyUnicode_EncodeCharmap(PyObject *unicode,
7974 PyObject *mapping,
7975 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007977 /* output object */
7978 PyObject *res = NULL;
7979 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007980 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007981 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007982 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007983 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984 PyObject *errorHandler = NULL;
7985 PyObject *exc = NULL;
7986 /* the following variable is used for caching string comparisons
7987 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7988 * 3=ignore, 4=xmlcharrefreplace */
7989 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02007990 void *data;
7991 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992
Benjamin Petersonbac79492012-01-14 13:34:47 -05007993 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007994 return NULL;
7995 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02007996 data = PyUnicode_DATA(unicode);
7997 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007998
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 /* Default to Latin-1 */
8000 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008001 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008003 /* allocate enough for a simple encoding without
8004 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008005 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008006 if (res == NULL)
8007 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008008 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008012 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008014 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 if (x==enc_EXCEPTION) /* error */
8016 goto onError;
8017 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008018 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 &exc,
8020 &known_errorHandler, &errorHandler, errors,
8021 &res, &respos)) {
8022 goto onError;
8023 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008024 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 else
8026 /* done with this character => adjust input position */
8027 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008031 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008032 if (_PyBytes_Resize(&res, respos) < 0)
8033 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008034
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008035 Py_XDECREF(exc);
8036 Py_XDECREF(errorHandler);
8037 return res;
8038
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008040 Py_XDECREF(res);
8041 Py_XDECREF(exc);
8042 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 return NULL;
8044}
8045
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008046/* Deprecated */
8047PyObject *
8048PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8049 Py_ssize_t size,
8050 PyObject *mapping,
8051 const char *errors)
8052{
8053 PyObject *result;
8054 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8055 if (unicode == NULL)
8056 return NULL;
8057 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8058 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008059 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008060}
8061
Alexander Belopolsky40018472011-02-26 01:02:56 +00008062PyObject *
8063PyUnicode_AsCharmapString(PyObject *unicode,
8064 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065{
8066 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 PyErr_BadArgument();
8068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008070 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071}
8072
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008073/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008074static void
8075make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008076 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008077 Py_ssize_t startpos, Py_ssize_t endpos,
8078 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081 *exceptionObject = _PyUnicodeTranslateError_Create(
8082 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 }
8084 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8086 goto onError;
8087 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8088 goto onError;
8089 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8090 goto onError;
8091 return;
8092 onError:
8093 Py_DECREF(*exceptionObject);
8094 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 }
8096}
8097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098/* error handling callback helper:
8099 build arguments, call the callback and check the arguments,
8100 put the result into newpos and return the replacement string, which
8101 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008102static PyObject *
8103unicode_translate_call_errorhandler(const char *errors,
8104 PyObject **errorHandler,
8105 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008106 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008107 Py_ssize_t startpos, Py_ssize_t endpos,
8108 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008110 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008112 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113 PyObject *restuple;
8114 PyObject *resunicode;
8115
8116 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120 }
8121
8122 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008124 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008126
8127 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008132 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 Py_DECREF(restuple);
8134 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008135 }
8136 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 &resunicode, &i_newpos)) {
8138 Py_DECREF(restuple);
8139 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008141 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008142 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143 else
8144 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008145 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8147 Py_DECREF(restuple);
8148 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150 Py_INCREF(resunicode);
8151 Py_DECREF(restuple);
8152 return resunicode;
8153}
8154
8155/* Lookup the character ch in the mapping and put the result in result,
8156 which must be decrefed by the caller.
8157 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008158static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008160{
Christian Heimes217cfd12007-12-02 14:31:20 +00008161 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 PyObject *x;
8163
8164 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008166 x = PyObject_GetItem(mapping, w);
8167 Py_DECREF(w);
8168 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8170 /* No mapping found means: use 1:1 mapping. */
8171 PyErr_Clear();
8172 *result = NULL;
8173 return 0;
8174 } else
8175 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008176 }
8177 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 *result = x;
8179 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008180 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008181 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 long value = PyLong_AS_LONG(x);
8183 long max = PyUnicode_GetMax();
8184 if (value < 0 || value > max) {
8185 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008186 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 Py_DECREF(x);
8188 return -1;
8189 }
8190 *result = x;
8191 return 0;
8192 }
8193 else if (PyUnicode_Check(x)) {
8194 *result = x;
8195 return 0;
8196 }
8197 else {
8198 /* wrong return value */
8199 PyErr_SetString(PyExc_TypeError,
8200 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 Py_DECREF(x);
8202 return -1;
8203 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008204}
8205/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 if not reallocate and adjust various state variables.
8207 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008208static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008209charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008213 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008214 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 /* exponentially overallocate to minimize reallocations */
8216 if (requiredsize < 2 * oldsize)
8217 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008218 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8219 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008221 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 }
8224 return 0;
8225}
8226/* lookup the character, put the result in the output string and adjust
8227 various state variables. Return a new reference to the object that
8228 was put in the output buffer in *result, or Py_None, if the mapping was
8229 undefined (in which case no character was written).
8230 The called must decref result.
8231 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008232static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8234 PyObject *mapping, Py_UCS4 **output,
8235 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008236 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8239 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 }
8245 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008247 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250 }
8251 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 Py_ssize_t repsize;
8253 if (PyUnicode_READY(*res) == -1)
8254 return -1;
8255 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 if (repsize==1) {
8257 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 }
8260 else if (repsize!=0) {
8261 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262 Py_ssize_t requiredsize = *opos +
8263 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 Py_ssize_t i;
8266 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268 for(i = 0; i < repsize; i++)
8269 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 }
8272 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 return 0;
8275}
8276
Alexander Belopolsky40018472011-02-26 01:02:56 +00008277PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278_PyUnicode_TranslateCharmap(PyObject *input,
8279 PyObject *mapping,
8280 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008282 /* input object */
8283 char *idata;
8284 Py_ssize_t size, i;
8285 int kind;
8286 /* output buffer */
8287 Py_UCS4 *output = NULL;
8288 Py_ssize_t osize;
8289 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008291 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 char *reason = "character maps to <undefined>";
8293 PyObject *errorHandler = NULL;
8294 PyObject *exc = NULL;
8295 /* the following variable is used for caching string comparisons
8296 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8297 * 3=ignore, 4=xmlcharrefreplace */
8298 int known_errorHandler = -1;
8299
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 PyErr_BadArgument();
8302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 if (PyUnicode_READY(input) == -1)
8306 return NULL;
8307 idata = (char*)PyUnicode_DATA(input);
8308 kind = PyUnicode_KIND(input);
8309 size = PyUnicode_GET_LENGTH(input);
8310 i = 0;
8311
8312 if (size == 0) {
8313 Py_INCREF(input);
8314 return input;
8315 }
8316
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317 /* allocate enough for a simple 1:1 translation without
8318 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 osize = size;
8320 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8321 opos = 0;
8322 if (output == NULL) {
8323 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 /* try to encode it */
8329 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330 if (charmaptranslate_output(input, i, mapping,
8331 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 Py_XDECREF(x);
8333 goto onError;
8334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008335 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 else { /* untranslatable character */
8339 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8340 Py_ssize_t repsize;
8341 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 Py_ssize_t collstart = i;
8345 Py_ssize_t collend = i+1;
8346 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 while (collend < size) {
8350 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 goto onError;
8352 Py_XDECREF(x);
8353 if (x!=Py_None)
8354 break;
8355 ++collend;
8356 }
8357 /* cache callback name lookup
8358 * (if not done yet, i.e. it's the first error) */
8359 if (known_errorHandler==-1) {
8360 if ((errors==NULL) || (!strcmp(errors, "strict")))
8361 known_errorHandler = 1;
8362 else if (!strcmp(errors, "replace"))
8363 known_errorHandler = 2;
8364 else if (!strcmp(errors, "ignore"))
8365 known_errorHandler = 3;
8366 else if (!strcmp(errors, "xmlcharrefreplace"))
8367 known_errorHandler = 4;
8368 else
8369 known_errorHandler = 0;
8370 }
8371 switch (known_errorHandler) {
8372 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008373 make_translate_exception(&exc,
8374 input, collstart, collend, reason);
8375 if (exc != NULL)
8376 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008377 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 case 2: /* replace */
8379 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 for (coll = collstart; coll<collend; coll++)
8381 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 /* fall through */
8383 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 break;
8386 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 /* generate replacement (temporarily (mis)uses i) */
8388 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 char buffer[2+29+1+1];
8390 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8392 if (charmaptranslate_makespace(&output, &osize,
8393 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 goto onError;
8395 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 break;
8400 default:
8401 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 reason, input, &exc,
8403 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008404 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008406 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008407 Py_DECREF(repunicode);
8408 goto onError;
8409 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 repsize = PyUnicode_GET_LENGTH(repunicode);
8412 if (charmaptranslate_makespace(&output, &osize,
8413 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 Py_DECREF(repunicode);
8415 goto onError;
8416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 for (uni2 = 0; repsize-->0; ++uni2)
8418 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8419 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008422 }
8423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8425 if (!res)
8426 goto onError;
8427 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 Py_XDECREF(exc);
8429 Py_XDECREF(errorHandler);
8430 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 Py_XDECREF(exc);
8435 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 return NULL;
8437}
8438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439/* Deprecated. Use PyUnicode_Translate instead. */
8440PyObject *
8441PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8442 Py_ssize_t size,
8443 PyObject *mapping,
8444 const char *errors)
8445{
Christian Heimes5f520f42012-09-11 14:03:25 +02008446 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8448 if (!unicode)
8449 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008450 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8451 Py_DECREF(unicode);
8452 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453}
8454
Alexander Belopolsky40018472011-02-26 01:02:56 +00008455PyObject *
8456PyUnicode_Translate(PyObject *str,
8457 PyObject *mapping,
8458 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459{
8460 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008461
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 str = PyUnicode_FromObject(str);
8463 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008464 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466 Py_DECREF(str);
8467 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468}
Tim Petersced69f82003-09-16 20:30:58 +00008469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008471fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472{
8473 /* No need to call PyUnicode_READY(self) because this function is only
8474 called as a callback from fixup() which does it already. */
8475 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8476 const int kind = PyUnicode_KIND(self);
8477 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008478 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008479 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 Py_ssize_t i;
8481
8482 for (i = 0; i < len; ++i) {
8483 ch = PyUnicode_READ(kind, data, i);
8484 fixed = 0;
8485 if (ch > 127) {
8486 if (Py_UNICODE_ISSPACE(ch))
8487 fixed = ' ';
8488 else {
8489 const int decimal = Py_UNICODE_TODECIMAL(ch);
8490 if (decimal >= 0)
8491 fixed = '0' + decimal;
8492 }
8493 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008494 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008495 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 PyUnicode_WRITE(kind, data, i, fixed);
8497 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008498 else
8499 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 }
8502
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008503 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504}
8505
8506PyObject *
8507_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8508{
8509 if (!PyUnicode_Check(unicode)) {
8510 PyErr_BadInternalCall();
8511 return NULL;
8512 }
8513 if (PyUnicode_READY(unicode) == -1)
8514 return NULL;
8515 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8516 /* If the string is already ASCII, just return the same string */
8517 Py_INCREF(unicode);
8518 return unicode;
8519 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008520 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521}
8522
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008523PyObject *
8524PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8525 Py_ssize_t length)
8526{
Victor Stinnerf0124502011-11-21 23:12:56 +01008527 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008528 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008529 Py_UCS4 maxchar;
8530 enum PyUnicode_Kind kind;
8531 void *data;
8532
Victor Stinner99d7ad02012-02-22 13:37:39 +01008533 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008534 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008535 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008536 if (ch > 127) {
8537 int decimal = Py_UNICODE_TODECIMAL(ch);
8538 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008539 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008540 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008541 }
8542 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008543
8544 /* Copy to a new string */
8545 decimal = PyUnicode_New(length, maxchar);
8546 if (decimal == NULL)
8547 return decimal;
8548 kind = PyUnicode_KIND(decimal);
8549 data = PyUnicode_DATA(decimal);
8550 /* Iterate over code points */
8551 for (i = 0; i < length; i++) {
8552 Py_UNICODE ch = s[i];
8553 if (ch > 127) {
8554 int decimal = Py_UNICODE_TODECIMAL(ch);
8555 if (decimal >= 0)
8556 ch = '0' + decimal;
8557 }
8558 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008560 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008561}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008562/* --- Decimal Encoder ---------------------------------------------------- */
8563
Alexander Belopolsky40018472011-02-26 01:02:56 +00008564int
8565PyUnicode_EncodeDecimal(Py_UNICODE *s,
8566 Py_ssize_t length,
8567 char *output,
8568 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008569{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008570 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008571 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008572 enum PyUnicode_Kind kind;
8573 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008574
8575 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 PyErr_BadArgument();
8577 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008578 }
8579
Victor Stinner42bf7752011-11-21 22:52:58 +01008580 unicode = PyUnicode_FromUnicode(s, length);
8581 if (unicode == NULL)
8582 return -1;
8583
Benjamin Petersonbac79492012-01-14 13:34:47 -05008584 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008585 Py_DECREF(unicode);
8586 return -1;
8587 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008588 kind = PyUnicode_KIND(unicode);
8589 data = PyUnicode_DATA(unicode);
8590
Victor Stinnerb84d7232011-11-22 01:50:07 +01008591 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008592 PyObject *exc;
8593 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008595 Py_ssize_t startpos;
8596
8597 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008598
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008600 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008601 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008603 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 decimal = Py_UNICODE_TODECIMAL(ch);
8605 if (decimal >= 0) {
8606 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008607 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 continue;
8609 }
8610 if (0 < ch && ch < 256) {
8611 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008612 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 continue;
8614 }
Victor Stinner6345be92011-11-25 20:09:01 +01008615
Victor Stinner42bf7752011-11-21 22:52:58 +01008616 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008617 exc = NULL;
8618 raise_encode_exception(&exc, "decimal", unicode,
8619 startpos, startpos+1,
8620 "invalid decimal Unicode string");
8621 Py_XDECREF(exc);
8622 Py_DECREF(unicode);
8623 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008624 }
8625 /* 0-terminate the output string */
8626 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008627 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008628 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008629}
8630
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631/* --- Helpers ------------------------------------------------------------ */
8632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008634any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 Py_ssize_t start,
8636 Py_ssize_t end)
8637{
8638 int kind1, kind2, kind;
8639 void *buf1, *buf2;
8640 Py_ssize_t len1, len2, result;
8641
8642 kind1 = PyUnicode_KIND(s1);
8643 kind2 = PyUnicode_KIND(s2);
8644 kind = kind1 > kind2 ? kind1 : kind2;
8645 buf1 = PyUnicode_DATA(s1);
8646 buf2 = PyUnicode_DATA(s2);
8647 if (kind1 != kind)
8648 buf1 = _PyUnicode_AsKind(s1, kind);
8649 if (!buf1)
8650 return -2;
8651 if (kind2 != kind)
8652 buf2 = _PyUnicode_AsKind(s2, kind);
8653 if (!buf2) {
8654 if (kind1 != kind) PyMem_Free(buf1);
8655 return -2;
8656 }
8657 len1 = PyUnicode_GET_LENGTH(s1);
8658 len2 = PyUnicode_GET_LENGTH(s2);
8659
Victor Stinner794d5672011-10-10 03:21:36 +02008660 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008661 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008662 case PyUnicode_1BYTE_KIND:
8663 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8664 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8665 else
8666 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8667 break;
8668 case PyUnicode_2BYTE_KIND:
8669 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8670 break;
8671 case PyUnicode_4BYTE_KIND:
8672 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8673 break;
8674 default:
8675 assert(0); result = -2;
8676 }
8677 }
8678 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008679 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008680 case PyUnicode_1BYTE_KIND:
8681 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8682 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8683 else
8684 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8685 break;
8686 case PyUnicode_2BYTE_KIND:
8687 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8688 break;
8689 case PyUnicode_4BYTE_KIND:
8690 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8691 break;
8692 default:
8693 assert(0); result = -2;
8694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 }
8696
8697 if (kind1 != kind)
8698 PyMem_Free(buf1);
8699 if (kind2 != kind)
8700 PyMem_Free(buf2);
8701
8702 return result;
8703}
8704
8705Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008706_PyUnicode_InsertThousandsGrouping(
8707 PyObject *unicode, Py_ssize_t index,
8708 Py_ssize_t n_buffer,
8709 void *digits, Py_ssize_t n_digits,
8710 Py_ssize_t min_width,
8711 const char *grouping, PyObject *thousands_sep,
8712 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713{
Victor Stinner41a863c2012-02-24 00:37:51 +01008714 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008715 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008716 Py_ssize_t thousands_sep_len;
8717 Py_ssize_t len;
8718
8719 if (unicode != NULL) {
8720 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008721 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008722 }
8723 else {
8724 kind = PyUnicode_1BYTE_KIND;
8725 data = NULL;
8726 }
8727 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8728 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8729 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8730 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008731 if (thousands_sep_kind < kind) {
8732 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8733 if (!thousands_sep_data)
8734 return -1;
8735 }
8736 else {
8737 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8738 if (!data)
8739 return -1;
8740 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008741 }
8742
Benjamin Petersonead6b532011-12-20 17:23:42 -06008743 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008745 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008746 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008747 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008748 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008749 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008750 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008751 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008752 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008753 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008754 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008755 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008757 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008758 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008759 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008760 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008761 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008763 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008764 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008765 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008766 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008767 break;
8768 default:
8769 assert(0);
8770 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008772 if (unicode != NULL && thousands_sep_kind != kind) {
8773 if (thousands_sep_kind < kind)
8774 PyMem_Free(thousands_sep_data);
8775 else
8776 PyMem_Free(data);
8777 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008778 if (unicode == NULL) {
8779 *maxchar = 127;
8780 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008781 *maxchar = MAX_MAXCHAR(*maxchar,
8782 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008783 }
8784 }
8785 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786}
8787
8788
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the length of
   the sequence being sliced and may be evaluated more than once, so it
   must not have side effects.

     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to `len` and, if
       still negative, clamped to 0.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to exactly one statement: used under an unbraced `if`/`else`, the whole
   adjustment is guarded rather than just its first `if`.  Invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008803
Alexander Belopolsky40018472011-02-26 01:02:56 +00008804Py_ssize_t
8805PyUnicode_Count(PyObject *str,
8806 PyObject *substr,
8807 Py_ssize_t start,
8808 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008810 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008811 PyObject* str_obj;
8812 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 int kind1, kind2, kind;
8814 void *buf1 = NULL, *buf2 = NULL;
8815 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008816
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008817 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008818 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008820 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008821 if (!sub_obj) {
8822 Py_DECREF(str_obj);
8823 return -1;
8824 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008825 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008826 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 Py_DECREF(str_obj);
8828 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 }
Tim Petersced69f82003-09-16 20:30:58 +00008830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 kind1 = PyUnicode_KIND(str_obj);
8832 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008833 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008836 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008837 if (kind2 > kind) {
8838 Py_DECREF(sub_obj);
8839 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008840 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008841 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008842 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 if (!buf2)
8845 goto onError;
8846 len1 = PyUnicode_GET_LENGTH(str_obj);
8847 len2 = PyUnicode_GET_LENGTH(sub_obj);
8848
8849 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008850 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008852 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8853 result = asciilib_count(
8854 ((Py_UCS1*)buf1) + start, end - start,
8855 buf2, len2, PY_SSIZE_T_MAX
8856 );
8857 else
8858 result = ucs1lib_count(
8859 ((Py_UCS1*)buf1) + start, end - start,
8860 buf2, len2, PY_SSIZE_T_MAX
8861 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 break;
8863 case PyUnicode_2BYTE_KIND:
8864 result = ucs2lib_count(
8865 ((Py_UCS2*)buf1) + start, end - start,
8866 buf2, len2, PY_SSIZE_T_MAX
8867 );
8868 break;
8869 case PyUnicode_4BYTE_KIND:
8870 result = ucs4lib_count(
8871 ((Py_UCS4*)buf1) + start, end - start,
8872 buf2, len2, PY_SSIZE_T_MAX
8873 );
8874 break;
8875 default:
8876 assert(0); result = 0;
8877 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008878
8879 Py_DECREF(sub_obj);
8880 Py_DECREF(str_obj);
8881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 if (kind2 != kind)
8883 PyMem_Free(buf2);
8884
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 onError:
8887 Py_DECREF(sub_obj);
8888 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 if (kind2 != kind && buf2)
8890 PyMem_Free(buf2);
8891 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892}
8893
Alexander Belopolsky40018472011-02-26 01:02:56 +00008894Py_ssize_t
8895PyUnicode_Find(PyObject *str,
8896 PyObject *sub,
8897 Py_ssize_t start,
8898 Py_ssize_t end,
8899 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008901 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008902
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008904 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008905 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008906 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008907 if (!sub) {
8908 Py_DECREF(str);
8909 return -2;
8910 }
8911 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8912 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 Py_DECREF(str);
8914 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915 }
Tim Petersced69f82003-09-16 20:30:58 +00008916
Victor Stinner794d5672011-10-10 03:21:36 +02008917 result = any_find_slice(direction,
8918 str, sub, start, end
8919 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008920
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008922 Py_DECREF(sub);
8923
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 return result;
8925}
8926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927Py_ssize_t
8928PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8929 Py_ssize_t start, Py_ssize_t end,
8930 int direction)
8931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008933 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 if (PyUnicode_READY(str) == -1)
8935 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008936 if (start < 0 || end < 0) {
8937 PyErr_SetString(PyExc_IndexError, "string index out of range");
8938 return -2;
8939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 if (end > PyUnicode_GET_LENGTH(str))
8941 end = PyUnicode_GET_LENGTH(str);
8942 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008943 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8944 kind, end-start, ch, direction);
8945 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008947 else
8948 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949}
8950
Alexander Belopolsky40018472011-02-26 01:02:56 +00008951static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008952tailmatch(PyObject *self,
8953 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008954 Py_ssize_t start,
8955 Py_ssize_t end,
8956 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 int kind_self;
8959 int kind_sub;
8960 void *data_self;
8961 void *data_sub;
8962 Py_ssize_t offset;
8963 Py_ssize_t i;
8964 Py_ssize_t end_sub;
8965
8966 if (PyUnicode_READY(self) == -1 ||
8967 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01008968 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969
8970 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 return 1;
8972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8974 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 kind_self = PyUnicode_KIND(self);
8979 data_self = PyUnicode_DATA(self);
8980 kind_sub = PyUnicode_KIND(substring);
8981 data_sub = PyUnicode_DATA(substring);
8982 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8983
8984 if (direction > 0)
8985 offset = end;
8986 else
8987 offset = start;
8988
8989 if (PyUnicode_READ(kind_self, data_self, offset) ==
8990 PyUnicode_READ(kind_sub, data_sub, 0) &&
8991 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8992 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8993 /* If both are of the same kind, memcmp is sufficient */
8994 if (kind_self == kind_sub) {
8995 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008996 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 data_sub,
8998 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008999 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 }
9001 /* otherwise we have to compare each character by first accesing it */
9002 else {
9003 /* We do not need to compare 0 and len(substring)-1 because
9004 the if statement above ensured already that they are equal
9005 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 for (i = 1; i < end_sub; ++i) {
9007 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9008 PyUnicode_READ(kind_sub, data_sub, i))
9009 return 0;
9010 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013 }
9014
9015 return 0;
9016}
9017
Alexander Belopolsky40018472011-02-26 01:02:56 +00009018Py_ssize_t
9019PyUnicode_Tailmatch(PyObject *str,
9020 PyObject *substr,
9021 Py_ssize_t start,
9022 Py_ssize_t end,
9023 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009025 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009026
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 str = PyUnicode_FromObject(str);
9028 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 substr = PyUnicode_FromObject(substr);
9031 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 Py_DECREF(str);
9033 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 }
Tim Petersced69f82003-09-16 20:30:58 +00009035
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009036 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 Py_DECREF(str);
9039 Py_DECREF(substr);
9040 return result;
9041}
9042
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043/* Apply fixfct filter to the Unicode object self and return a
9044 reference to the modified object */
9045
Alexander Belopolsky40018472011-02-26 01:02:56 +00009046static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009047fixup(PyObject *self,
9048 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 PyObject *u;
9051 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009052 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009054 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009057 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 /* fix functions return the new maximum character in a string,
9060 if the kind of the resulting unicode object does not change,
9061 everything is fine. Otherwise we need to change the string kind
9062 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009063 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009064
9065 if (maxchar_new == 0) {
9066 /* no changes */;
9067 if (PyUnicode_CheckExact(self)) {
9068 Py_DECREF(u);
9069 Py_INCREF(self);
9070 return self;
9071 }
9072 else
9073 return u;
9074 }
9075
Victor Stinnere6abb482012-05-02 01:15:40 +02009076 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077
Victor Stinnereaab6042011-12-11 22:22:39 +01009078 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009080
9081 /* In case the maximum character changed, we need to
9082 convert the string to the new category. */
9083 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9084 if (v == NULL) {
9085 Py_DECREF(u);
9086 return NULL;
9087 }
9088 if (maxchar_new > maxchar_old) {
9089 /* If the maxchar increased so that the kind changed, not all
9090 characters are representable anymore and we need to fix the
9091 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009092 _PyUnicode_FastCopyCharacters(v, 0,
9093 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009094 maxchar_old = fixfct(v);
9095 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 }
9097 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009098 _PyUnicode_FastCopyCharacters(v, 0,
9099 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009101 Py_DECREF(u);
9102 assert(_PyUnicode_CheckConsistency(v, 1));
9103 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104}
9105
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009106static PyObject *
9107ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009109 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9110 char *resdata, *data = PyUnicode_DATA(self);
9111 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009112
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009113 res = PyUnicode_New(len, 127);
9114 if (res == NULL)
9115 return NULL;
9116 resdata = PyUnicode_DATA(res);
9117 if (lower)
9118 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009120 _Py_bytes_upper(resdata, data, len);
9121 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122}
9123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009125handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009127 Py_ssize_t j;
9128 int final_sigma;
9129 Py_UCS4 c;
9130 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009131
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009132 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9133
9134 where ! is a negation and \p{xxx} is a character with property xxx.
9135 */
9136 for (j = i - 1; j >= 0; j--) {
9137 c = PyUnicode_READ(kind, data, j);
9138 if (!_PyUnicode_IsCaseIgnorable(c))
9139 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009141 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9142 if (final_sigma) {
9143 for (j = i + 1; j < length; j++) {
9144 c = PyUnicode_READ(kind, data, j);
9145 if (!_PyUnicode_IsCaseIgnorable(c))
9146 break;
9147 }
9148 final_sigma = j == length || !_PyUnicode_IsCased(c);
9149 }
9150 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151}
9152
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009153static int
9154lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9155 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009157 /* Obscure special case. */
9158 if (c == 0x3A3) {
9159 mapped[0] = handle_capital_sigma(kind, data, length, i);
9160 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009162 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163}
9164
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009165static Py_ssize_t
9166do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009168 Py_ssize_t i, k = 0;
9169 int n_res, j;
9170 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009171
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009172 c = PyUnicode_READ(kind, data, 0);
9173 n_res = _PyUnicode_ToUpperFull(c, mapped);
9174 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009175 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009176 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009178 for (i = 1; i < length; i++) {
9179 c = PyUnicode_READ(kind, data, i);
9180 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9181 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009182 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009183 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009184 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009185 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009186 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187}
9188
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009189static Py_ssize_t
9190do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9191 Py_ssize_t i, k = 0;
9192
9193 for (i = 0; i < length; i++) {
9194 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9195 int n_res, j;
9196 if (Py_UNICODE_ISUPPER(c)) {
9197 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9198 }
9199 else if (Py_UNICODE_ISLOWER(c)) {
9200 n_res = _PyUnicode_ToUpperFull(c, mapped);
9201 }
9202 else {
9203 n_res = 1;
9204 mapped[0] = c;
9205 }
9206 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009207 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009208 res[k++] = mapped[j];
9209 }
9210 }
9211 return k;
9212}
9213
9214static Py_ssize_t
9215do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9216 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009218 Py_ssize_t i, k = 0;
9219
9220 for (i = 0; i < length; i++) {
9221 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9222 int n_res, j;
9223 if (lower)
9224 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9225 else
9226 n_res = _PyUnicode_ToUpperFull(c, mapped);
9227 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009228 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009229 res[k++] = mapped[j];
9230 }
9231 }
9232 return k;
9233}
9234
9235static Py_ssize_t
9236do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9237{
9238 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9239}
9240
9241static Py_ssize_t
9242do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9243{
9244 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9245}
9246
Benjamin Petersone51757f2012-01-12 21:10:29 -05009247static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009248do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9249{
9250 Py_ssize_t i, k = 0;
9251
9252 for (i = 0; i < length; i++) {
9253 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9254 Py_UCS4 mapped[3];
9255 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9256 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009257 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009258 res[k++] = mapped[j];
9259 }
9260 }
9261 return k;
9262}
9263
9264static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009265do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9266{
9267 Py_ssize_t i, k = 0;
9268 int previous_is_cased;
9269
9270 previous_is_cased = 0;
9271 for (i = 0; i < length; i++) {
9272 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9273 Py_UCS4 mapped[3];
9274 int n_res, j;
9275
9276 if (previous_is_cased)
9277 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9278 else
9279 n_res = _PyUnicode_ToTitleFull(c, mapped);
9280
9281 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009282 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009283 res[k++] = mapped[j];
9284 }
9285
9286 previous_is_cased = _PyUnicode_IsCased(c);
9287 }
9288 return k;
9289}
9290
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009291static PyObject *
9292case_operation(PyObject *self,
9293 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9294{
9295 PyObject *res = NULL;
9296 Py_ssize_t length, newlength = 0;
9297 int kind, outkind;
9298 void *data, *outdata;
9299 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9300
Benjamin Petersoneea48462012-01-16 14:28:50 -05009301 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009302
9303 kind = PyUnicode_KIND(self);
9304 data = PyUnicode_DATA(self);
9305 length = PyUnicode_GET_LENGTH(self);
9306 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9307 if (tmp == NULL)
9308 return PyErr_NoMemory();
9309 newlength = perform(kind, data, length, tmp, &maxchar);
9310 res = PyUnicode_New(newlength, maxchar);
9311 if (res == NULL)
9312 goto leave;
9313 tmpend = tmp + newlength;
9314 outdata = PyUnicode_DATA(res);
9315 outkind = PyUnicode_KIND(res);
9316 switch (outkind) {
9317 case PyUnicode_1BYTE_KIND:
9318 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9319 break;
9320 case PyUnicode_2BYTE_KIND:
9321 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9322 break;
9323 case PyUnicode_4BYTE_KIND:
9324 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9325 break;
9326 default:
9327 assert(0);
9328 break;
9329 }
9330 leave:
9331 PyMem_FREE(tmp);
9332 return res;
9333}
9334
Tim Peters8ce9f162004-08-27 01:49:32 +00009335PyObject *
9336PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009339 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009341 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009342 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9343 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009344 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009346 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009348 int use_memcpy;
9349 unsigned char *res_data = NULL, *sep_data = NULL;
9350 PyObject *last_obj;
9351 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352
Tim Peters05eba1f2004-08-27 21:32:02 +00009353 fseq = PySequence_Fast(seq, "");
9354 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009355 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009356 }
9357
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009358 /* NOTE: the following code can't call back into Python code,
9359 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009360 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009361
Tim Peters05eba1f2004-08-27 21:32:02 +00009362 seqlen = PySequence_Fast_GET_SIZE(fseq);
9363 /* If empty sequence, return u"". */
9364 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009365 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009366 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009367 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009368
Tim Peters05eba1f2004-08-27 21:32:02 +00009369 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009370 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009371 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009372 if (seqlen == 1) {
9373 if (PyUnicode_CheckExact(items[0])) {
9374 res = items[0];
9375 Py_INCREF(res);
9376 Py_DECREF(fseq);
9377 return res;
9378 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009379 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009380 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009381 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009382 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009383 /* Set up sep and seplen */
9384 if (separator == NULL) {
9385 /* fall back to a blank space separator */
9386 sep = PyUnicode_FromOrdinal(' ');
9387 if (!sep)
9388 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009389 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009390 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009391 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009392 else {
9393 if (!PyUnicode_Check(separator)) {
9394 PyErr_Format(PyExc_TypeError,
9395 "separator: expected str instance,"
9396 " %.80s found",
9397 Py_TYPE(separator)->tp_name);
9398 goto onError;
9399 }
9400 if (PyUnicode_READY(separator))
9401 goto onError;
9402 sep = separator;
9403 seplen = PyUnicode_GET_LENGTH(separator);
9404 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9405 /* inc refcount to keep this code path symmetric with the
9406 above case of a blank separator */
9407 Py_INCREF(sep);
9408 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009409 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009410 }
9411
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009412 /* There are at least two things to join, or else we have a subclass
9413 * of str in the sequence.
9414 * Do a pre-pass to figure out the total amount of space we'll
9415 * need (sz), and see whether all argument are strings.
9416 */
9417 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009418#ifdef Py_DEBUG
9419 use_memcpy = 0;
9420#else
9421 use_memcpy = 1;
9422#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009423 for (i = 0; i < seqlen; i++) {
9424 const Py_ssize_t old_sz = sz;
9425 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009426 if (!PyUnicode_Check(item)) {
9427 PyErr_Format(PyExc_TypeError,
9428 "sequence item %zd: expected str instance,"
9429 " %.80s found",
9430 i, Py_TYPE(item)->tp_name);
9431 goto onError;
9432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 if (PyUnicode_READY(item) == -1)
9434 goto onError;
9435 sz += PyUnicode_GET_LENGTH(item);
9436 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009437 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009438 if (i != 0)
9439 sz += seplen;
9440 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9441 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009442 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009443 goto onError;
9444 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009445 if (use_memcpy && last_obj != NULL) {
9446 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9447 use_memcpy = 0;
9448 }
9449 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009450 }
Tim Petersced69f82003-09-16 20:30:58 +00009451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009453 if (res == NULL)
9454 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009455
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009456 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009457#ifdef Py_DEBUG
9458 use_memcpy = 0;
9459#else
9460 if (use_memcpy) {
9461 res_data = PyUnicode_1BYTE_DATA(res);
9462 kind = PyUnicode_KIND(res);
9463 if (seplen != 0)
9464 sep_data = PyUnicode_1BYTE_DATA(sep);
9465 }
9466#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009467 if (use_memcpy) {
9468 for (i = 0; i < seqlen; ++i) {
9469 Py_ssize_t itemlen;
9470 item = items[i];
9471
9472 /* Copy item, and maybe the separator. */
9473 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009474 Py_MEMCPY(res_data,
9475 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009476 kind * seplen);
9477 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009478 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009479
9480 itemlen = PyUnicode_GET_LENGTH(item);
9481 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009482 Py_MEMCPY(res_data,
9483 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009484 kind * itemlen);
9485 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009486 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009487 }
9488 assert(res_data == PyUnicode_1BYTE_DATA(res)
9489 + kind * PyUnicode_GET_LENGTH(res));
9490 }
9491 else {
9492 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9493 Py_ssize_t itemlen;
9494 item = items[i];
9495
9496 /* Copy item, and maybe the separator. */
9497 if (i && seplen != 0) {
9498 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9499 res_offset += seplen;
9500 }
9501
9502 itemlen = PyUnicode_GET_LENGTH(item);
9503 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009504 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009505 res_offset += itemlen;
9506 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009507 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009508 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009509 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009510
Tim Peters05eba1f2004-08-27 21:32:02 +00009511 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009513 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009517 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009519 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520 return NULL;
9521}
9522
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive code units of the buffer `data`, beginning at unit index
   `start`.  `kind` selects the unit width (1, 2 or 4 bytes) and must not
   be PyUnicode_WCHAR_KIND.

   Caveat: in the 2- and 4-byte cases `value` and `length` are evaluated
   once per iteration, so pass side-effect-free expressions. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9546
Victor Stinnerd3f08822012-05-29 12:57:52 +02009547void
9548_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9549 Py_UCS4 fill_char)
9550{
9551 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9552 const void *data = PyUnicode_DATA(unicode);
9553 assert(PyUnicode_IS_READY(unicode));
9554 assert(unicode_modifiable(unicode));
9555 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9556 assert(start >= 0);
9557 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9558 FILL(kind, data, fill_char, start, length);
9559}
9560
Victor Stinner3fe55312012-01-04 00:33:50 +01009561Py_ssize_t
9562PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9563 Py_UCS4 fill_char)
9564{
9565 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009566
9567 if (!PyUnicode_Check(unicode)) {
9568 PyErr_BadInternalCall();
9569 return -1;
9570 }
9571 if (PyUnicode_READY(unicode) == -1)
9572 return -1;
9573 if (unicode_check_modifiable(unicode))
9574 return -1;
9575
Victor Stinnerd3f08822012-05-29 12:57:52 +02009576 if (start < 0) {
9577 PyErr_SetString(PyExc_IndexError, "string index out of range");
9578 return -1;
9579 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009580 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9581 PyErr_SetString(PyExc_ValueError,
9582 "fill character is bigger than "
9583 "the string maximum character");
9584 return -1;
9585 }
9586
9587 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9588 length = Py_MIN(maxlen, length);
9589 if (length <= 0)
9590 return 0;
9591
Victor Stinnerd3f08822012-05-29 12:57:52 +02009592 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009593 return length;
9594}
9595
Victor Stinner9310abb2011-10-05 00:59:23 +02009596static PyObject *
9597pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009598 Py_ssize_t left,
9599 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 PyObject *u;
9603 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009604 int kind;
9605 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606
9607 if (left < 0)
9608 left = 0;
9609 if (right < 0)
9610 right = 0;
9611
Victor Stinnerc4b49542011-12-11 22:44:26 +01009612 if (left == 0 && right == 0)
9613 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9616 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009617 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9618 return NULL;
9619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009621 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009623 if (!u)
9624 return NULL;
9625
9626 kind = PyUnicode_KIND(u);
9627 data = PyUnicode_DATA(u);
9628 if (left)
9629 FILL(kind, data, fill, 0, left);
9630 if (right)
9631 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009632 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009633 assert(_PyUnicode_CheckConsistency(u, 1));
9634 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635}
9636
Alexander Belopolsky40018472011-02-26 01:02:56 +00009637PyObject *
9638PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641
9642 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009643 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009644 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009645 if (PyUnicode_READY(string) == -1) {
9646 Py_DECREF(string);
9647 return NULL;
9648 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649
Benjamin Petersonead6b532011-12-20 17:23:42 -06009650 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009652 if (PyUnicode_IS_ASCII(string))
9653 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009654 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009655 PyUnicode_GET_LENGTH(string), keepends);
9656 else
9657 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009658 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009659 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 break;
9661 case PyUnicode_2BYTE_KIND:
9662 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009663 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 PyUnicode_GET_LENGTH(string), keepends);
9665 break;
9666 case PyUnicode_4BYTE_KIND:
9667 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009668 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 PyUnicode_GET_LENGTH(string), keepends);
9670 break;
9671 default:
9672 assert(0);
9673 list = 0;
9674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675 Py_DECREF(string);
9676 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677}
9678
Alexander Belopolsky40018472011-02-26 01:02:56 +00009679static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009680split(PyObject *self,
9681 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009682 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 int kind1, kind2, kind;
9685 void *buf1, *buf2;
9686 Py_ssize_t len1, len2;
9687 PyObject* out;
9688
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009690 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 if (PyUnicode_READY(self) == -1)
9693 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009696 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009698 if (PyUnicode_IS_ASCII(self))
9699 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009700 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009701 PyUnicode_GET_LENGTH(self), maxcount
9702 );
9703 else
9704 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009705 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009706 PyUnicode_GET_LENGTH(self), maxcount
9707 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 case PyUnicode_2BYTE_KIND:
9709 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009710 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 PyUnicode_GET_LENGTH(self), maxcount
9712 );
9713 case PyUnicode_4BYTE_KIND:
9714 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009715 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 PyUnicode_GET_LENGTH(self), maxcount
9717 );
9718 default:
9719 assert(0);
9720 return NULL;
9721 }
9722
9723 if (PyUnicode_READY(substring) == -1)
9724 return NULL;
9725
9726 kind1 = PyUnicode_KIND(self);
9727 kind2 = PyUnicode_KIND(substring);
9728 kind = kind1 > kind2 ? kind1 : kind2;
9729 buf1 = PyUnicode_DATA(self);
9730 buf2 = PyUnicode_DATA(substring);
9731 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009732 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 if (!buf1)
9734 return NULL;
9735 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009736 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 if (!buf2) {
9738 if (kind1 != kind) PyMem_Free(buf1);
9739 return NULL;
9740 }
9741 len1 = PyUnicode_GET_LENGTH(self);
9742 len2 = PyUnicode_GET_LENGTH(substring);
9743
Benjamin Petersonead6b532011-12-20 17:23:42 -06009744 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009746 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9747 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009748 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009749 else
9750 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009751 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 break;
9753 case PyUnicode_2BYTE_KIND:
9754 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009755 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 break;
9757 case PyUnicode_4BYTE_KIND:
9758 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009759 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 break;
9761 default:
9762 out = NULL;
9763 }
9764 if (kind1 != kind)
9765 PyMem_Free(buf1);
9766 if (kind2 != kind)
9767 PyMem_Free(buf2);
9768 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769}
9770
Alexander Belopolsky40018472011-02-26 01:02:56 +00009771static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009772rsplit(PyObject *self,
9773 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009774 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 int kind1, kind2, kind;
9777 void *buf1, *buf2;
9778 Py_ssize_t len1, len2;
9779 PyObject* out;
9780
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009781 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009782 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 if (PyUnicode_READY(self) == -1)
9785 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009788 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009790 if (PyUnicode_IS_ASCII(self))
9791 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009792 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009793 PyUnicode_GET_LENGTH(self), maxcount
9794 );
9795 else
9796 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009797 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009798 PyUnicode_GET_LENGTH(self), maxcount
9799 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 case PyUnicode_2BYTE_KIND:
9801 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009802 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 PyUnicode_GET_LENGTH(self), maxcount
9804 );
9805 case PyUnicode_4BYTE_KIND:
9806 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009807 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 PyUnicode_GET_LENGTH(self), maxcount
9809 );
9810 default:
9811 assert(0);
9812 return NULL;
9813 }
9814
9815 if (PyUnicode_READY(substring) == -1)
9816 return NULL;
9817
9818 kind1 = PyUnicode_KIND(self);
9819 kind2 = PyUnicode_KIND(substring);
9820 kind = kind1 > kind2 ? kind1 : kind2;
9821 buf1 = PyUnicode_DATA(self);
9822 buf2 = PyUnicode_DATA(substring);
9823 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009824 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 if (!buf1)
9826 return NULL;
9827 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009828 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 if (!buf2) {
9830 if (kind1 != kind) PyMem_Free(buf1);
9831 return NULL;
9832 }
9833 len1 = PyUnicode_GET_LENGTH(self);
9834 len2 = PyUnicode_GET_LENGTH(substring);
9835
Benjamin Petersonead6b532011-12-20 17:23:42 -06009836 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009838 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9839 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009840 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009841 else
9842 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009843 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 break;
9845 case PyUnicode_2BYTE_KIND:
9846 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009847 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 break;
9849 case PyUnicode_4BYTE_KIND:
9850 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009851 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 break;
9853 default:
9854 out = NULL;
9855 }
9856 if (kind1 != kind)
9857 PyMem_Free(buf1);
9858 if (kind2 != kind)
9859 PyMem_Free(buf2);
9860 return out;
9861}
9862
9863static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009864anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9865 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009867 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009869 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9870 return asciilib_find(buf1, len1, buf2, len2, offset);
9871 else
9872 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 case PyUnicode_2BYTE_KIND:
9874 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9875 case PyUnicode_4BYTE_KIND:
9876 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9877 }
9878 assert(0);
9879 return -1;
9880}
9881
9882static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009883anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9884 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009886 switch (kind) {
9887 case PyUnicode_1BYTE_KIND:
9888 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9889 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9890 else
9891 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9892 case PyUnicode_2BYTE_KIND:
9893 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9894 case PyUnicode_4BYTE_KIND:
9895 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9896 }
9897 assert(0);
9898 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009899}
9900
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009901static void
9902replace_1char_inplace(PyObject *u, Py_ssize_t pos,
9903 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
9904{
9905 int kind = PyUnicode_KIND(u);
9906 void *data = PyUnicode_DATA(u);
9907 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
9908 if (kind == PyUnicode_1BYTE_KIND) {
9909 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
9910 (Py_UCS1 *)data + len,
9911 u1, u2, maxcount);
9912 }
9913 else if (kind == PyUnicode_2BYTE_KIND) {
9914 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
9915 (Py_UCS2 *)data + len,
9916 u1, u2, maxcount);
9917 }
9918 else {
9919 assert(kind == PyUnicode_4BYTE_KIND);
9920 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
9921 (Py_UCS4 *)data + len,
9922 u1, u2, maxcount);
9923 }
9924}
9925
Alexander Belopolsky40018472011-02-26 01:02:56 +00009926static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927replace(PyObject *self, PyObject *str1,
9928 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 PyObject *u;
9931 char *sbuf = PyUnicode_DATA(self);
9932 char *buf1 = PyUnicode_DATA(str1);
9933 char *buf2 = PyUnicode_DATA(str2);
9934 int srelease = 0, release1 = 0, release2 = 0;
9935 int skind = PyUnicode_KIND(self);
9936 int kind1 = PyUnicode_KIND(str1);
9937 int kind2 = PyUnicode_KIND(str2);
9938 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9939 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9940 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009941 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009942 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943
9944 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009945 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009947 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948
Victor Stinner59de0ee2011-10-07 10:01:28 +02009949 if (str1 == str2)
9950 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951
Victor Stinner49a0a212011-10-12 23:46:10 +02009952 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009953 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
9954 if (maxchar < maxchar_str1)
9955 /* substring too wide to be present */
9956 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +02009957 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9958 /* Replacing str1 with str2 may cause a maxchar reduction in the
9959 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009960 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Victor Stinnere6abb482012-05-02 01:15:40 +02009961 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009964 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009966 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009968 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009969 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009970 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +01009971
Victor Stinner69ed0f42013-04-09 21:48:24 +02009972 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009973 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +01009974 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009975 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +02009976 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009978 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +01009980
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009981 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
9982 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +02009983 }
9984 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 int rkind = skind;
9986 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009987 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 if (kind1 < rkind) {
9990 /* widen substring */
9991 buf1 = _PyUnicode_AsKind(str1, rkind);
9992 if (!buf1) goto error;
9993 release1 = 1;
9994 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009995 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009996 if (i < 0)
9997 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 if (rkind > kind2) {
9999 /* widen replacement */
10000 buf2 = _PyUnicode_AsKind(str2, rkind);
10001 if (!buf2) goto error;
10002 release2 = 1;
10003 }
10004 else if (rkind < kind2) {
10005 /* widen self and buf1 */
10006 rkind = kind2;
10007 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010008 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 sbuf = _PyUnicode_AsKind(self, rkind);
10010 if (!sbuf) goto error;
10011 srelease = 1;
10012 buf1 = _PyUnicode_AsKind(str1, rkind);
10013 if (!buf1) goto error;
10014 release1 = 1;
10015 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010016 u = PyUnicode_New(slen, maxchar);
10017 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010019 assert(PyUnicode_KIND(u) == rkind);
10020 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010021
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010022 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010023 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010024 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010026 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010028
10029 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010030 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010031 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010032 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010033 if (i == -1)
10034 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010035 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010037 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010041 }
10042 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010044 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 int rkind = skind;
10046 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010049 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 buf1 = _PyUnicode_AsKind(str1, rkind);
10051 if (!buf1) goto error;
10052 release1 = 1;
10053 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010054 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010055 if (n == 0)
10056 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010058 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 buf2 = _PyUnicode_AsKind(str2, rkind);
10060 if (!buf2) goto error;
10061 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010064 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 rkind = kind2;
10066 sbuf = _PyUnicode_AsKind(self, rkind);
10067 if (!sbuf) goto error;
10068 srelease = 1;
10069 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010070 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 buf1 = _PyUnicode_AsKind(str1, rkind);
10072 if (!buf1) goto error;
10073 release1 = 1;
10074 }
10075 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10076 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010077 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 PyErr_SetString(PyExc_OverflowError,
10079 "replace string is too long");
10080 goto error;
10081 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010082 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010083 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010084 _Py_INCREF_UNICODE_EMPTY();
10085 if (!unicode_empty)
10086 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010087 u = unicode_empty;
10088 goto done;
10089 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010090 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 PyErr_SetString(PyExc_OverflowError,
10092 "replace string is too long");
10093 goto error;
10094 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010095 u = PyUnicode_New(new_size, maxchar);
10096 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010098 assert(PyUnicode_KIND(u) == rkind);
10099 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 ires = i = 0;
10101 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010102 while (n-- > 0) {
10103 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010104 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010105 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010106 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010107 if (j == -1)
10108 break;
10109 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010110 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010111 memcpy(res + rkind * ires,
10112 sbuf + rkind * i,
10113 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010115 }
10116 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010118 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010120 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010126 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010127 memcpy(res + rkind * ires,
10128 sbuf + rkind * i,
10129 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010130 }
10131 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010132 /* interleave */
10133 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010134 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010136 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010138 if (--n <= 0)
10139 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010140 memcpy(res + rkind * ires,
10141 sbuf + rkind * i,
10142 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 ires++;
10144 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010145 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010146 memcpy(res + rkind * ires,
10147 sbuf + rkind * i,
10148 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010149 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010150 }
10151
10152 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010153 unicode_adjust_maxchar(&u);
10154 if (u == NULL)
10155 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010157
10158 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (srelease)
10160 PyMem_FREE(sbuf);
10161 if (release1)
10162 PyMem_FREE(buf1);
10163 if (release2)
10164 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010165 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010167
Benjamin Peterson29060642009-01-31 22:14:21 +000010168 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010169 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 if (srelease)
10171 PyMem_FREE(sbuf);
10172 if (release1)
10173 PyMem_FREE(buf1);
10174 if (release2)
10175 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010176 return unicode_result_unchanged(self);
10177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 error:
10179 if (srelease && sbuf)
10180 PyMem_FREE(sbuf);
10181 if (release1 && buf1)
10182 PyMem_FREE(buf1);
10183 if (release2 && buf2)
10184 PyMem_FREE(buf2);
10185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186}
10187
10188/* --- Unicode Object Methods --------------------------------------------- */
10189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010190PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192\n\
10193Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010194characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195
10196static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010197unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010199 if (PyUnicode_READY(self) == -1)
10200 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010201 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202}
10203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010204PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010205 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206\n\
10207Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010208have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209
10210static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010211unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010213 if (PyUnicode_READY(self) == -1)
10214 return NULL;
10215 if (PyUnicode_GET_LENGTH(self) == 0)
10216 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010217 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218}
10219
Benjamin Petersond5890c82012-01-14 13:23:30 -050010220PyDoc_STRVAR(casefold__doc__,
10221 "S.casefold() -> str\n\
10222\n\
10223Return a version of S suitable for caseless comparisons.");
10224
10225static PyObject *
10226unicode_casefold(PyObject *self)
10227{
10228 if (PyUnicode_READY(self) == -1)
10229 return NULL;
10230 if (PyUnicode_IS_ASCII(self))
10231 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010232 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010233}
10234
10235
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010236/* Argument converter. Coerces to a single unicode character */
10237
10238static int
10239convert_uc(PyObject *obj, void *addr)
10240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010242 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010243
Benjamin Peterson14339b62009-01-31 16:36:08 +000010244 uniobj = PyUnicode_FromObject(obj);
10245 if (uniobj == NULL) {
10246 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010247 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010248 return 0;
10249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010251 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010252 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010253 Py_DECREF(uniobj);
10254 return 0;
10255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 Py_DECREF(uniobj);
10258 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010259}
10260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010261PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010264Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010265done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266
10267static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010268unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010270 Py_ssize_t marg, left;
10271 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 Py_UCS4 fillchar = ' ';
10273
Victor Stinnere9a29352011-10-01 02:14:59 +020010274 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276
Benjamin Petersonbac79492012-01-14 13:34:47 -050010277 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278 return NULL;
10279
Victor Stinnerc4b49542011-12-11 22:44:26 +010010280 if (PyUnicode_GET_LENGTH(self) >= width)
10281 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282
Victor Stinnerc4b49542011-12-11 22:44:26 +010010283 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284 left = marg / 2 + (marg & width & 1);
10285
Victor Stinner9310abb2011-10-05 00:59:23 +020010286 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287}
10288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289/* This function assumes that str1 and str2 are readied by the caller. */
10290
Marc-André Lemburge5034372000-08-08 08:04:29 +000010291static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010292unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010293{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010294#define COMPARE(TYPE1, TYPE2) \
10295 do { \
10296 TYPE1* p1 = (TYPE1 *)data1; \
10297 TYPE2* p2 = (TYPE2 *)data2; \
10298 TYPE1* end = p1 + len; \
10299 Py_UCS4 c1, c2; \
10300 for (; p1 != end; p1++, p2++) { \
10301 c1 = *p1; \
10302 c2 = *p2; \
10303 if (c1 != c2) \
10304 return (c1 < c2) ? -1 : 1; \
10305 } \
10306 } \
10307 while (0)
10308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 int kind1, kind2;
10310 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010311 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010312
Victor Stinner90db9c42012-10-04 21:53:50 +020010313 /* a string is equal to itself */
10314 if (str1 == str2)
10315 return 0;
10316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 kind1 = PyUnicode_KIND(str1);
10318 kind2 = PyUnicode_KIND(str2);
10319 data1 = PyUnicode_DATA(str1);
10320 data2 = PyUnicode_DATA(str2);
10321 len1 = PyUnicode_GET_LENGTH(str1);
10322 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010323 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010324
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010325 switch(kind1) {
10326 case PyUnicode_1BYTE_KIND:
10327 {
10328 switch(kind2) {
10329 case PyUnicode_1BYTE_KIND:
10330 {
10331 int cmp = memcmp(data1, data2, len);
10332 /* normalize result of memcmp() into the range [-1; 1] */
10333 if (cmp < 0)
10334 return -1;
10335 if (cmp > 0)
10336 return 1;
10337 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010338 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010339 case PyUnicode_2BYTE_KIND:
10340 COMPARE(Py_UCS1, Py_UCS2);
10341 break;
10342 case PyUnicode_4BYTE_KIND:
10343 COMPARE(Py_UCS1, Py_UCS4);
10344 break;
10345 default:
10346 assert(0);
10347 }
10348 break;
10349 }
10350 case PyUnicode_2BYTE_KIND:
10351 {
10352 switch(kind2) {
10353 case PyUnicode_1BYTE_KIND:
10354 COMPARE(Py_UCS2, Py_UCS1);
10355 break;
10356 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010357 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010358 COMPARE(Py_UCS2, Py_UCS2);
10359 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010360 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010361 case PyUnicode_4BYTE_KIND:
10362 COMPARE(Py_UCS2, Py_UCS4);
10363 break;
10364 default:
10365 assert(0);
10366 }
10367 break;
10368 }
10369 case PyUnicode_4BYTE_KIND:
10370 {
10371 switch(kind2) {
10372 case PyUnicode_1BYTE_KIND:
10373 COMPARE(Py_UCS4, Py_UCS1);
10374 break;
10375 case PyUnicode_2BYTE_KIND:
10376 COMPARE(Py_UCS4, Py_UCS2);
10377 break;
10378 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010379 {
10380#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10381 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10382 /* normalize result of wmemcmp() into the range [-1; 1] */
10383 if (cmp < 0)
10384 return -1;
10385 if (cmp > 0)
10386 return 1;
10387#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010388 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010389#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010390 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010391 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010392 default:
10393 assert(0);
10394 }
10395 break;
10396 }
10397 default:
10398 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010399 }
10400
Victor Stinner770e19e2012-10-04 22:59:45 +020010401 if (len1 == len2)
10402 return 0;
10403 if (len1 < len2)
10404 return -1;
10405 else
10406 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010407
10408#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010409}
10410
Victor Stinnere5567ad2012-10-23 02:48:49 +020010411static int
10412unicode_compare_eq(PyObject *str1, PyObject *str2)
10413{
10414 int kind;
10415 void *data1, *data2;
10416 Py_ssize_t len;
10417 int cmp;
10418
10419 /* a string is equal to itself */
10420 if (str1 == str2)
10421 return 1;
10422
10423 len = PyUnicode_GET_LENGTH(str1);
10424 if (PyUnicode_GET_LENGTH(str2) != len)
10425 return 0;
10426 kind = PyUnicode_KIND(str1);
10427 if (PyUnicode_KIND(str2) != kind)
10428 return 0;
10429 data1 = PyUnicode_DATA(str1);
10430 data2 = PyUnicode_DATA(str2);
10431
10432 cmp = memcmp(data1, data2, len * kind);
10433 return (cmp == 0);
10434}
10435
10436
Alexander Belopolsky40018472011-02-26 01:02:56 +000010437int
10438PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10441 if (PyUnicode_READY(left) == -1 ||
10442 PyUnicode_READY(right) == -1)
10443 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010444 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010446 PyErr_Format(PyExc_TypeError,
10447 "Can't compare %.100s and %.100s",
10448 left->ob_type->tp_name,
10449 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450 return -1;
10451}
10452
Martin v. Löwis5b222132007-06-10 09:51:05 +000010453int
10454PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 Py_ssize_t i;
10457 int kind;
10458 void *data;
10459 Py_UCS4 chr;
10460
Victor Stinner910337b2011-10-03 03:20:16 +020010461 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (PyUnicode_READY(uni) == -1)
10463 return -1;
10464 kind = PyUnicode_KIND(uni);
10465 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010466 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10468 if (chr != str[i])
10469 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010470 /* This check keeps Python strings that end in '\0' from comparing equal
10471 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010473 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010474 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010475 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010476 return 0;
10477}
10478
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010479
Benjamin Peterson29060642009-01-31 22:14:21 +000010480#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010481 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010482
Alexander Belopolsky40018472011-02-26 01:02:56 +000010483PyObject *
10484PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010485{
10486 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010487 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010488
Victor Stinnere5567ad2012-10-23 02:48:49 +020010489 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10490 Py_RETURN_NOTIMPLEMENTED;
10491
10492 if (PyUnicode_READY(left) == -1 ||
10493 PyUnicode_READY(right) == -1)
10494 return NULL;
10495
10496 if (op == Py_EQ || op == Py_NE) {
10497 result = unicode_compare_eq(left, right);
10498 if (op == Py_EQ)
10499 v = TEST_COND(result);
10500 else
10501 v = TEST_COND(!result);
10502 }
10503 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010504 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010505
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010506 /* Convert the return value to a Boolean */
10507 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010508 case Py_LE:
10509 v = TEST_COND(result <= 0);
10510 break;
10511 case Py_GE:
10512 v = TEST_COND(result >= 0);
10513 break;
10514 case Py_LT:
10515 v = TEST_COND(result == -1);
10516 break;
10517 case Py_GT:
10518 v = TEST_COND(result == 1);
10519 break;
10520 default:
10521 PyErr_BadArgument();
10522 return NULL;
10523 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010524 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010525 Py_INCREF(v);
10526 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010527}
10528
Alexander Belopolsky40018472011-02-26 01:02:56 +000010529int
10530PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010531{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010532 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010533 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 void *buf1, *buf2;
10535 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010536 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010537
10538 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010539 sub = PyUnicode_FromObject(element);
10540 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010541 PyErr_Format(PyExc_TypeError,
10542 "'in <string>' requires string as left operand, not %s",
10543 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010545 }
10546
Thomas Wouters477c8d52006-05-27 19:21:47 +000010547 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010548 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010549 Py_DECREF(sub);
10550 return -1;
10551 }
10552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 kind1 = PyUnicode_KIND(str);
10554 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 buf1 = PyUnicode_DATA(str);
10556 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010557 if (kind2 != kind1) {
10558 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010559 Py_DECREF(sub);
10560 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010561 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010562 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010563 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 if (!buf2) {
10566 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010567 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 return -1;
10569 }
10570 len1 = PyUnicode_GET_LENGTH(str);
10571 len2 = PyUnicode_GET_LENGTH(sub);
10572
Victor Stinner77282cb2013-04-14 19:22:47 +020010573 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 case PyUnicode_1BYTE_KIND:
10575 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10576 break;
10577 case PyUnicode_2BYTE_KIND:
10578 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10579 break;
10580 case PyUnicode_4BYTE_KIND:
10581 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10582 break;
10583 default:
10584 result = -1;
10585 assert(0);
10586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010587
10588 Py_DECREF(str);
10589 Py_DECREF(sub);
10590
Victor Stinner77282cb2013-04-14 19:22:47 +020010591 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 PyMem_Free(buf2);
10593
Guido van Rossum403d68b2000-03-13 15:55:09 +000010594 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010595}
10596
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597/* Concat to string or Unicode object giving a new Unicode object. */
10598
Alexander Belopolsky40018472011-02-26 01:02:56 +000010599PyObject *
10600PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010603 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010604 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
10606 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010609 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010612 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613
10614 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010615 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010616 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010619 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010620 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622 }
10623
Victor Stinner488fa492011-12-12 00:01:39 +010010624 u_len = PyUnicode_GET_LENGTH(u);
10625 v_len = PyUnicode_GET_LENGTH(v);
10626 if (u_len > PY_SSIZE_T_MAX - v_len) {
10627 PyErr_SetString(PyExc_OverflowError,
10628 "strings are too large to concat");
10629 goto onError;
10630 }
10631 new_len = u_len + v_len;
10632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010634 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010635 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010638 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010640 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010641 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10642 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 Py_DECREF(u);
10644 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010645 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647
Benjamin Peterson29060642009-01-31 22:14:21 +000010648 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649 Py_XDECREF(u);
10650 Py_XDECREF(v);
10651 return NULL;
10652}
10653
Walter Dörwald1ab83302007-05-18 17:15:44 +000010654void
Victor Stinner23e56682011-10-03 03:54:37 +020010655PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010656{
Victor Stinner23e56682011-10-03 03:54:37 +020010657 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010658 Py_UCS4 maxchar, maxchar2;
10659 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010660
10661 if (p_left == NULL) {
10662 if (!PyErr_Occurred())
10663 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010664 return;
10665 }
Victor Stinner23e56682011-10-03 03:54:37 +020010666 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020010667 if (right == NULL || left == NULL
10668 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010669 if (!PyErr_Occurred())
10670 PyErr_BadInternalCall();
10671 goto error;
10672 }
10673
Benjamin Petersonbac79492012-01-14 13:34:47 -050010674 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010675 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010676 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010677 goto error;
10678
Victor Stinner488fa492011-12-12 00:01:39 +010010679 /* Shortcuts */
10680 if (left == unicode_empty) {
10681 Py_DECREF(left);
10682 Py_INCREF(right);
10683 *p_left = right;
10684 return;
10685 }
10686 if (right == unicode_empty)
10687 return;
10688
10689 left_len = PyUnicode_GET_LENGTH(left);
10690 right_len = PyUnicode_GET_LENGTH(right);
10691 if (left_len > PY_SSIZE_T_MAX - right_len) {
10692 PyErr_SetString(PyExc_OverflowError,
10693 "strings are too large to concat");
10694 goto error;
10695 }
10696 new_len = left_len + right_len;
10697
10698 if (unicode_modifiable(left)
10699 && PyUnicode_CheckExact(right)
10700 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010701 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10702 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010703 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010704 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010705 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10706 {
10707 /* append inplace */
Victor Stinnerf0335102013-04-14 19:13:03 +020010708 res = resize_compact(left, new_len);
10709 if (res == NULL)
Victor Stinner488fa492011-12-12 00:01:39 +010010710 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020010711
10712 /* copy 'right' into the newly allocated area of 'res' (left) */
10713 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010714 }
Victor Stinner488fa492011-12-12 00:01:39 +010010715 else {
10716 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10717 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010718 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010719
Victor Stinner488fa492011-12-12 00:01:39 +010010720 /* Concat the two Unicode strings */
10721 res = PyUnicode_New(new_len, maxchar);
10722 if (res == NULL)
10723 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010724 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10725 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010726 Py_DECREF(left);
Victor Stinner488fa492011-12-12 00:01:39 +010010727 }
Victor Stinnerf0335102013-04-14 19:13:03 +020010728 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010010729 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010730 return;
10731
10732error:
Victor Stinner488fa492011-12-12 00:01:39 +010010733 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010734}
10735
10736void
10737PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10738{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010739 PyUnicode_Append(pleft, right);
10740 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010741}
10742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010743PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010744 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010746Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010747string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010748interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
10750static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010751unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010753 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010754 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010755 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 int kind1, kind2, kind;
10758 void *buf1, *buf2;
10759 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760
Jesus Ceaac451502011-04-20 17:09:23 +020010761 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10762 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 kind1 = PyUnicode_KIND(self);
10766 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010767 if (kind2 > kind1)
10768 return PyLong_FromLong(0);
10769 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 buf1 = PyUnicode_DATA(self);
10771 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010773 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 if (!buf2) {
10775 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 return NULL;
10777 }
10778 len1 = PyUnicode_GET_LENGTH(self);
10779 len2 = PyUnicode_GET_LENGTH(substring);
10780
10781 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010782 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 case PyUnicode_1BYTE_KIND:
10784 iresult = ucs1lib_count(
10785 ((Py_UCS1*)buf1) + start, end - start,
10786 buf2, len2, PY_SSIZE_T_MAX
10787 );
10788 break;
10789 case PyUnicode_2BYTE_KIND:
10790 iresult = ucs2lib_count(
10791 ((Py_UCS2*)buf1) + start, end - start,
10792 buf2, len2, PY_SSIZE_T_MAX
10793 );
10794 break;
10795 case PyUnicode_4BYTE_KIND:
10796 iresult = ucs4lib_count(
10797 ((Py_UCS4*)buf1) + start, end - start,
10798 buf2, len2, PY_SSIZE_T_MAX
10799 );
10800 break;
10801 default:
10802 assert(0); iresult = 0;
10803 }
10804
10805 result = PyLong_FromSsize_t(iresult);
10806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 if (kind2 != kind)
10808 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809
10810 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010811
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812 return result;
10813}
10814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010815PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010816 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010818Encode S using the codec registered for encoding. Default encoding\n\
10819is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010820handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010821a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10822'xmlcharrefreplace' as well as any other name registered with\n\
10823codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
10825static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010826unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010828 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829 char *encoding = NULL;
10830 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010831
Benjamin Peterson308d6372009-09-18 21:42:35 +000010832 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10833 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010835 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010836}
10837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010838PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010839 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840\n\
10841Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010842If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843
10844static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010845unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010847 Py_ssize_t i, j, line_pos, src_len, incr;
10848 Py_UCS4 ch;
10849 PyObject *u;
10850 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010852 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010853 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854
10855 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857
Antoine Pitrou22425222011-10-04 19:10:51 +020010858 if (PyUnicode_READY(self) == -1)
10859 return NULL;
10860
Thomas Wouters7e474022000-07-16 12:04:32 +000010861 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 src_len = PyUnicode_GET_LENGTH(self);
10863 i = j = line_pos = 0;
10864 kind = PyUnicode_KIND(self);
10865 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010866 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010867 for (; i < src_len; i++) {
10868 ch = PyUnicode_READ(kind, src_data, i);
10869 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010870 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010873 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010874 goto overflow;
10875 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010877 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010880 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010881 goto overflow;
10882 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 if (ch == '\n' || ch == '\r')
10885 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010887 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010888 if (!found)
10889 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010890
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010892 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 if (!u)
10894 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896
Antoine Pitroue71d5742011-10-04 15:55:09 +020010897 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898
Antoine Pitroue71d5742011-10-04 15:55:09 +020010899 for (; i < src_len; i++) {
10900 ch = PyUnicode_READ(kind, src_data, i);
10901 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010903 incr = tabsize - (line_pos % tabsize);
10904 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010905 FILL(kind, dest_data, ' ', j, incr);
10906 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010907 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010908 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010910 line_pos++;
10911 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010912 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010913 if (ch == '\n' || ch == '\r')
10914 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010916 }
10917 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010918 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010919
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010921 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923}
10924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010925PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010926 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927\n\
10928Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010929such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930arguments start and end are interpreted as in slice notation.\n\
10931\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010932Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933
10934static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010937 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010938 Py_ssize_t start;
10939 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010940 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941
Jesus Ceaac451502011-04-20 17:09:23 +020010942 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10943 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 if (PyUnicode_READY(self) == -1)
10947 return NULL;
10948 if (PyUnicode_READY(substring) == -1)
10949 return NULL;
10950
Victor Stinner7931d9a2011-11-04 00:22:48 +010010951 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952
10953 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 if (result == -2)
10956 return NULL;
10957
Christian Heimes217cfd12007-12-02 14:31:20 +000010958 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959}
10960
10961static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010962unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010964 void *data;
10965 enum PyUnicode_Kind kind;
10966 Py_UCS4 ch;
10967 PyObject *res;
10968
10969 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10970 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010972 }
10973 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10974 PyErr_SetString(PyExc_IndexError, "string index out of range");
10975 return NULL;
10976 }
10977 kind = PyUnicode_KIND(self);
10978 data = PyUnicode_DATA(self);
10979 ch = PyUnicode_READ(kind, data, index);
10980 if (ch < 256)
10981 return get_latin1_char(ch);
10982
10983 res = PyUnicode_New(1, ch);
10984 if (res == NULL)
10985 return NULL;
10986 kind = PyUnicode_KIND(res);
10987 data = PyUnicode_DATA(res);
10988 PyUnicode_WRITE(kind, data, 0, ch);
10989 assert(_PyUnicode_CheckConsistency(res, 1));
10990 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991}
10992
Guido van Rossumc2504932007-09-18 19:42:40 +000010993/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010994 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010995static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010996unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997{
Guido van Rossumc2504932007-09-18 19:42:40 +000010998 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080010999 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011000
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011001#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011002 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011003#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 if (_PyUnicode_HASH(self) != -1)
11005 return _PyUnicode_HASH(self);
11006 if (PyUnicode_READY(self) == -1)
11007 return -1;
11008 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011009 /*
11010 We make the hash of the empty string be 0, rather than using
11011 (prefix ^ suffix), since this slightly obfuscates the hash secret
11012 */
11013 if (len == 0) {
11014 _PyUnicode_HASH(self) = 0;
11015 return 0;
11016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017
11018 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011019#define HASH(P) \
11020 x ^= (Py_uhash_t) *P << 7; \
11021 while (--len >= 0) \
11022 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023
Georg Brandl2fb477c2012-02-21 00:33:36 +010011024 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 switch (PyUnicode_KIND(self)) {
11026 case PyUnicode_1BYTE_KIND: {
11027 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11028 HASH(c);
11029 break;
11030 }
11031 case PyUnicode_2BYTE_KIND: {
11032 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11033 HASH(s);
11034 break;
11035 }
11036 default: {
11037 Py_UCS4 *l;
11038 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11039 "Impossible switch case in unicode_hash");
11040 l = PyUnicode_4BYTE_DATA(self);
11041 HASH(l);
11042 break;
11043 }
11044 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011045 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11046 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047
Guido van Rossumc2504932007-09-18 19:42:40 +000011048 if (x == -1)
11049 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011051 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011055PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011058Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059
11060static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011063 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011064 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011065 Py_ssize_t start;
11066 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
Jesus Ceaac451502011-04-20 17:09:23 +020011068 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11069 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 if (PyUnicode_READY(self) == -1)
11073 return NULL;
11074 if (PyUnicode_READY(substring) == -1)
11075 return NULL;
11076
Victor Stinner7931d9a2011-11-04 00:22:48 +010011077 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078
11079 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 if (result == -2)
11082 return NULL;
11083
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084 if (result < 0) {
11085 PyErr_SetString(PyExc_ValueError, "substring not found");
11086 return NULL;
11087 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011088
Christian Heimes217cfd12007-12-02 14:31:20 +000011089 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090}
11091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011092PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011095Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011096at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097
11098static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011099unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 Py_ssize_t i, length;
11102 int kind;
11103 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104 int cased;
11105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 if (PyUnicode_READY(self) == -1)
11107 return NULL;
11108 length = PyUnicode_GET_LENGTH(self);
11109 kind = PyUnicode_KIND(self);
11110 data = PyUnicode_DATA(self);
11111
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 if (length == 1)
11114 return PyBool_FromLong(
11115 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011117 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011119 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011120
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011122 for (i = 0; i < length; i++) {
11123 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011124
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11126 return PyBool_FromLong(0);
11127 else if (!cased && Py_UNICODE_ISLOWER(ch))
11128 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011130 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131}
11132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011133PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011134 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011136Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011137at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138
11139static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011140unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 Py_ssize_t i, length;
11143 int kind;
11144 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145 int cased;
11146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 if (PyUnicode_READY(self) == -1)
11148 return NULL;
11149 length = PyUnicode_GET_LENGTH(self);
11150 kind = PyUnicode_KIND(self);
11151 data = PyUnicode_DATA(self);
11152
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 if (length == 1)
11155 return PyBool_FromLong(
11156 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011158 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011160 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011161
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 for (i = 0; i < length; i++) {
11164 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011165
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11167 return PyBool_FromLong(0);
11168 else if (!cased && Py_UNICODE_ISUPPER(ch))
11169 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011171 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172}
11173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011174PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011175 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011177Return True if S is a titlecased string and there is at least one\n\
11178character in S, i.e. upper- and titlecase characters may only\n\
11179follow uncased characters and lowercase characters only cased ones.\n\
11180Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181
11182static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011183unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 Py_ssize_t i, length;
11186 int kind;
11187 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 int cased, previous_is_cased;
11189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 if (PyUnicode_READY(self) == -1)
11191 return NULL;
11192 length = PyUnicode_GET_LENGTH(self);
11193 kind = PyUnicode_KIND(self);
11194 data = PyUnicode_DATA(self);
11195
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 if (length == 1) {
11198 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11199 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11200 (Py_UNICODE_ISUPPER(ch) != 0));
11201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011203 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011205 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011206
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 cased = 0;
11208 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 for (i = 0; i < length; i++) {
11210 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011211
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11213 if (previous_is_cased)
11214 return PyBool_FromLong(0);
11215 previous_is_cased = 1;
11216 cased = 1;
11217 }
11218 else if (Py_UNICODE_ISLOWER(ch)) {
11219 if (!previous_is_cased)
11220 return PyBool_FromLong(0);
11221 previous_is_cased = 1;
11222 cased = 1;
11223 }
11224 else
11225 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011227 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228}
11229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011230PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011233Return True if all characters in S are whitespace\n\
11234and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
11236static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011237unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 Py_ssize_t i, length;
11240 int kind;
11241 void *data;
11242
11243 if (PyUnicode_READY(self) == -1)
11244 return NULL;
11245 length = PyUnicode_GET_LENGTH(self);
11246 kind = PyUnicode_KIND(self);
11247 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 if (length == 1)
11251 return PyBool_FromLong(
11252 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011254 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 for (i = 0; i < length; i++) {
11259 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011260 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011263 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264}
11265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011266PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011268\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011269Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011270and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011271
11272static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011273unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 Py_ssize_t i, length;
11276 int kind;
11277 void *data;
11278
11279 if (PyUnicode_READY(self) == -1)
11280 return NULL;
11281 length = PyUnicode_GET_LENGTH(self);
11282 kind = PyUnicode_KIND(self);
11283 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011284
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011285 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 if (length == 1)
11287 return PyBool_FromLong(
11288 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011289
11290 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 for (i = 0; i < length; i++) {
11295 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011297 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011298 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299}
11300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011301PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011303\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011304Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011306
11307static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011308unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 int kind;
11311 void *data;
11312 Py_ssize_t len, i;
11313
11314 if (PyUnicode_READY(self) == -1)
11315 return NULL;
11316
11317 kind = PyUnicode_KIND(self);
11318 data = PyUnicode_DATA(self);
11319 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011320
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011321 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 if (len == 1) {
11323 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11324 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11325 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011326
11327 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011329 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 for (i = 0; i < len; i++) {
11332 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011333 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011334 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011335 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011336 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011337}
11338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011339PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011340 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011342Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011343False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344
11345static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011346unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 Py_ssize_t i, length;
11349 int kind;
11350 void *data;
11351
11352 if (PyUnicode_READY(self) == -1)
11353 return NULL;
11354 length = PyUnicode_GET_LENGTH(self);
11355 kind = PyUnicode_KIND(self);
11356 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 if (length == 1)
11360 return PyBool_FromLong(
11361 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011363 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 for (i = 0; i < length; i++) {
11368 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011371 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372}
11373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011374PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011377Return True if all characters in S are digits\n\
11378and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379
11380static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011381unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 Py_ssize_t i, length;
11384 int kind;
11385 void *data;
11386
11387 if (PyUnicode_READY(self) == -1)
11388 return NULL;
11389 length = PyUnicode_GET_LENGTH(self);
11390 kind = PyUnicode_KIND(self);
11391 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 if (length == 1) {
11395 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11396 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011399 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 for (i = 0; i < length; i++) {
11404 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011407 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408}
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011413Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
11416static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011417unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 Py_ssize_t i, length;
11420 int kind;
11421 void *data;
11422
11423 if (PyUnicode_READY(self) == -1)
11424 return NULL;
11425 length = PyUnicode_GET_LENGTH(self);
11426 kind = PyUnicode_KIND(self);
11427 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 if (length == 1)
11431 return PyBool_FromLong(
11432 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011434 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 for (i = 0; i < length; i++) {
11439 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011442 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443}
11444
Martin v. Löwis47383402007-08-15 07:32:56 +000011445int
11446PyUnicode_IsIdentifier(PyObject *self)
11447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 int kind;
11449 void *data;
11450 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011451 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 if (PyUnicode_READY(self) == -1) {
11454 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 }
11457
11458 /* Special case for empty strings */
11459 if (PyUnicode_GET_LENGTH(self) == 0)
11460 return 0;
11461 kind = PyUnicode_KIND(self);
11462 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011463
11464 /* PEP 3131 says that the first character must be in
11465 XID_Start and subsequent characters in XID_Continue,
11466 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011467 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011468 letters, digits, underscore). However, given the current
11469 definition of XID_Start and XID_Continue, it is sufficient
11470 to check just for these, except that _ must be allowed
11471 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011473 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011474 return 0;
11475
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011476 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011479 return 1;
11480}
11481
11482PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011484\n\
11485Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011486to the language definition.\n\
11487\n\
11488Use keyword.iskeyword() to test for reserved identifiers\n\
11489such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011490
11491static PyObject*
11492unicode_isidentifier(PyObject *self)
11493{
11494 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11495}
11496
Georg Brandl559e5d72008-06-11 18:37:52 +000011497PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011499\n\
11500Return True if all characters in S are considered\n\
11501printable in repr() or S is empty, False otherwise.");
11502
11503static PyObject*
11504unicode_isprintable(PyObject *self)
11505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 Py_ssize_t i, length;
11507 int kind;
11508 void *data;
11509
11510 if (PyUnicode_READY(self) == -1)
11511 return NULL;
11512 length = PyUnicode_GET_LENGTH(self);
11513 kind = PyUnicode_KIND(self);
11514 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011515
11516 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (length == 1)
11518 return PyBool_FromLong(
11519 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 for (i = 0; i < length; i++) {
11522 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011523 Py_RETURN_FALSE;
11524 }
11525 }
11526 Py_RETURN_TRUE;
11527}
11528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011529PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011530 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531\n\
11532Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011533iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
11535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011536unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011538 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539}
11540
Martin v. Löwis18e16552006-02-15 17:27:45 +000011541static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011542unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 if (PyUnicode_READY(self) == -1)
11545 return -1;
11546 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547}
11548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011549PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011552Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011553done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
11555static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011556unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011558 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 Py_UCS4 fillchar = ' ';
11560
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011561 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 return NULL;
11563
Benjamin Petersonbac79492012-01-14 13:34:47 -050011564 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566
Victor Stinnerc4b49542011-12-11 22:44:26 +010011567 if (PyUnicode_GET_LENGTH(self) >= width)
11568 return unicode_result_unchanged(self);
11569
11570 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571}
11572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011573PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011576Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
11578static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011579unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011581 if (PyUnicode_READY(self) == -1)
11582 return NULL;
11583 if (PyUnicode_IS_ASCII(self))
11584 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011585 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586}
11587
/* Selectors for the three strip variants; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string without its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
11596
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011597/* externally visible for str.strip(unicode) */
11598PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011599_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011600{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 void *data;
11602 int kind;
11603 Py_ssize_t i, j, len;
11604 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011605 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11608 return NULL;
11609
11610 kind = PyUnicode_KIND(self);
11611 data = PyUnicode_DATA(self);
11612 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011613 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11615 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011616 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011617
Benjamin Peterson14339b62009-01-31 16:36:08 +000011618 i = 0;
11619 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011620 while (i < len) {
11621 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11622 if (!BLOOM(sepmask, ch))
11623 break;
11624 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11625 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 i++;
11627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011628 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011629
Benjamin Peterson14339b62009-01-31 16:36:08 +000011630 j = len;
11631 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011632 j--;
11633 while (j >= i) {
11634 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11635 if (!BLOOM(sepmask, ch))
11636 break;
11637 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11638 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011640 }
11641
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011643 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011644
Victor Stinner7931d9a2011-11-04 00:22:48 +010011645 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646}
11647
11648PyObject*
11649PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11650{
11651 unsigned char *data;
11652 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011653 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654
Victor Stinnerde636f32011-10-01 03:55:54 +020011655 if (PyUnicode_READY(self) == -1)
11656 return NULL;
11657
Victor Stinner684d5fd2012-05-03 02:32:34 +020011658 length = PyUnicode_GET_LENGTH(self);
11659 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011660
Victor Stinner684d5fd2012-05-03 02:32:34 +020011661 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011662 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663
Victor Stinnerde636f32011-10-01 03:55:54 +020011664 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011665 PyErr_SetString(PyExc_IndexError, "string index out of range");
11666 return NULL;
11667 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011668 if (start >= length || end < start)
11669 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011670
Victor Stinner684d5fd2012-05-03 02:32:34 +020011671 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011672 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011673 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011674 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011675 }
11676 else {
11677 kind = PyUnicode_KIND(self);
11678 data = PyUnicode_1BYTE_DATA(self);
11679 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011680 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011681 length);
11682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684
11685static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011686do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 Py_ssize_t len, i, j;
11689
11690 if (PyUnicode_READY(self) == -1)
11691 return NULL;
11692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694
Victor Stinnercc7af722013-04-09 22:39:24 +020011695 if (PyUnicode_IS_ASCII(self)) {
11696 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11697
11698 i = 0;
11699 if (striptype != RIGHTSTRIP) {
11700 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011701 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020011702 if (!_Py_ascii_whitespace[ch])
11703 break;
11704 i++;
11705 }
11706 }
11707
11708 j = len;
11709 if (striptype != LEFTSTRIP) {
11710 j--;
11711 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011712 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020011713 if (!_Py_ascii_whitespace[ch])
11714 break;
11715 j--;
11716 }
11717 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011718 }
11719 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011720 else {
11721 int kind = PyUnicode_KIND(self);
11722 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011723
Victor Stinnercc7af722013-04-09 22:39:24 +020011724 i = 0;
11725 if (striptype != RIGHTSTRIP) {
11726 while (i < len) {
11727 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11728 if (!Py_UNICODE_ISSPACE(ch))
11729 break;
11730 i++;
11731 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011732 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011733
11734 j = len;
11735 if (striptype != LEFTSTRIP) {
11736 j--;
11737 while (j >= i) {
11738 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11739 if (!Py_UNICODE_ISSPACE(ch))
11740 break;
11741 j--;
11742 }
11743 j++;
11744 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011745 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011746
Victor Stinner7931d9a2011-11-04 00:22:48 +010011747 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748}
11749
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750
11751static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011752do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011753{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011754 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755
Benjamin Peterson14339b62009-01-31 16:36:08 +000011756 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11757 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011758
Benjamin Peterson14339b62009-01-31 16:36:08 +000011759 if (sep != NULL && sep != Py_None) {
11760 if (PyUnicode_Check(sep))
11761 return _PyUnicode_XStrip(self, striptype, sep);
11762 else {
11763 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 "%s arg must be None or str",
11765 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011766 return NULL;
11767 }
11768 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011769
Benjamin Peterson14339b62009-01-31 16:36:08 +000011770 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011771}
11772
11773
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011774PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011776\n\
11777Return a copy of the string S with leading and trailing\n\
11778whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011779If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011780
11781static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011782unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011783{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011784 if (PyTuple_GET_SIZE(args) == 0)
11785 return do_strip(self, BOTHSTRIP); /* Common case */
11786 else
11787 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011788}
11789
11790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011791PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011793\n\
11794Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011795If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011796
11797static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011798unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011799{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 if (PyTuple_GET_SIZE(args) == 0)
11801 return do_strip(self, LEFTSTRIP); /* Common case */
11802 else
11803 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011804}
11805
11806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011807PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011808 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011809\n\
11810Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011811If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812
11813static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011814unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011816 if (PyTuple_GET_SIZE(args) == 0)
11817 return do_strip(self, RIGHTSTRIP); /* Common case */
11818 else
11819 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011820}
11821
11822
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011824unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011826 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828
Serhiy Storchaka05997252013-01-26 12:14:02 +020011829 if (len < 1)
11830 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
Victor Stinnerc4b49542011-12-11 22:44:26 +010011832 /* no repeat, return original string */
11833 if (len == 1)
11834 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011835
Benjamin Petersonbac79492012-01-14 13:34:47 -050011836 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 return NULL;
11838
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011839 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011840 PyErr_SetString(PyExc_OverflowError,
11841 "repeated string is too long");
11842 return NULL;
11843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011845
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011846 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 if (!u)
11848 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011849 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 if (PyUnicode_GET_LENGTH(str) == 1) {
11852 const int kind = PyUnicode_KIND(str);
11853 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011854 if (kind == PyUnicode_1BYTE_KIND) {
11855 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011856 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011857 }
11858 else if (kind == PyUnicode_2BYTE_KIND) {
11859 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011860 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011861 ucs2[n] = fill_char;
11862 } else {
11863 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11864 assert(kind == PyUnicode_4BYTE_KIND);
11865 for (n = 0; n < len; ++n)
11866 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 }
11869 else {
11870 /* number of characters copied this far */
11871 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011872 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 char *to = (char *) PyUnicode_DATA(u);
11874 Py_MEMCPY(to, PyUnicode_DATA(str),
11875 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011876 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 n = (done <= nchars-done) ? done : nchars-done;
11878 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011879 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881 }
11882
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011883 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011884 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885}
11886
Alexander Belopolsky40018472011-02-26 01:02:56 +000011887PyObject *
11888PyUnicode_Replace(PyObject *obj,
11889 PyObject *subobj,
11890 PyObject *replobj,
11891 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892{
11893 PyObject *self;
11894 PyObject *str1;
11895 PyObject *str2;
11896 PyObject *result;
11897
11898 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011899 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011902 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 Py_DECREF(self);
11904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905 }
11906 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011907 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 Py_DECREF(self);
11909 Py_DECREF(str1);
11910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011912 if (PyUnicode_READY(self) == -1 ||
11913 PyUnicode_READY(str1) == -1 ||
11914 PyUnicode_READY(str2) == -1)
11915 result = NULL;
11916 else
11917 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918 Py_DECREF(self);
11919 Py_DECREF(str1);
11920 Py_DECREF(str2);
11921 return result;
11922}
11923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011924PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011925 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926\n\
11927Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011928old replaced by new. If the optional argument count is\n\
11929given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930
11931static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 PyObject *str1;
11935 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011936 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 PyObject *result;
11938
Martin v. Löwis18e16552006-02-15 17:27:45 +000011939 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011941 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011944 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 return NULL;
11946 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011947 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 Py_DECREF(str1);
11949 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011950 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011951 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11952 result = NULL;
11953 else
11954 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
11956 Py_DECREF(str1);
11957 Py_DECREF(str2);
11958 return result;
11959}
11960
Alexander Belopolsky40018472011-02-26 01:02:56 +000011961static PyObject *
11962unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011964 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 Py_ssize_t isize;
11966 Py_ssize_t osize, squote, dquote, i, o;
11967 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020011968 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011972 return NULL;
11973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 isize = PyUnicode_GET_LENGTH(unicode);
11975 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 /* Compute length of output, quote characters, and
11978 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020011979 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 max = 127;
11981 squote = dquote = 0;
11982 ikind = PyUnicode_KIND(unicode);
11983 for (i = 0; i < isize; i++) {
11984 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11985 switch (ch) {
11986 case '\'': squote++; osize++; break;
11987 case '"': dquote++; osize++; break;
11988 case '\\': case '\t': case '\r': case '\n':
11989 osize += 2; break;
11990 default:
11991 /* Fast-path ASCII */
11992 if (ch < ' ' || ch == 0x7f)
11993 osize += 4; /* \xHH */
11994 else if (ch < 0x7f)
11995 osize++;
11996 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11997 osize++;
11998 max = ch > max ? ch : max;
11999 }
12000 else if (ch < 0x100)
12001 osize += 4; /* \xHH */
12002 else if (ch < 0x10000)
12003 osize += 6; /* \uHHHH */
12004 else
12005 osize += 10; /* \uHHHHHHHH */
12006 }
12007 }
12008
12009 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012010 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012012 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (dquote)
12014 /* Both squote and dquote present. Use squote,
12015 and escape them */
12016 osize += squote;
12017 else
12018 quote = '"';
12019 }
Victor Stinner55c08782013-04-14 18:45:39 +020012020 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021
12022 repr = PyUnicode_New(osize, max);
12023 if (repr == NULL)
12024 return NULL;
12025 okind = PyUnicode_KIND(repr);
12026 odata = PyUnicode_DATA(repr);
12027
12028 PyUnicode_WRITE(okind, odata, 0, quote);
12029 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012030 if (unchanged) {
12031 _PyUnicode_FastCopyCharacters(repr, 1,
12032 unicode, 0,
12033 isize);
12034 }
12035 else {
12036 for (i = 0, o = 1; i < isize; i++) {
12037 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038
Victor Stinner55c08782013-04-14 18:45:39 +020012039 /* Escape quotes and backslashes */
12040 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012041 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012043 continue;
12044 }
12045
12046 /* Map special whitespace to '\t', \n', '\r' */
12047 if (ch == '\t') {
12048 PyUnicode_WRITE(okind, odata, o++, '\\');
12049 PyUnicode_WRITE(okind, odata, o++, 't');
12050 }
12051 else if (ch == '\n') {
12052 PyUnicode_WRITE(okind, odata, o++, '\\');
12053 PyUnicode_WRITE(okind, odata, o++, 'n');
12054 }
12055 else if (ch == '\r') {
12056 PyUnicode_WRITE(okind, odata, o++, '\\');
12057 PyUnicode_WRITE(okind, odata, o++, 'r');
12058 }
12059
12060 /* Map non-printable US ASCII to '\xhh' */
12061 else if (ch < ' ' || ch == 0x7F) {
12062 PyUnicode_WRITE(okind, odata, o++, '\\');
12063 PyUnicode_WRITE(okind, odata, o++, 'x');
12064 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12065 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12066 }
12067
12068 /* Copy ASCII characters as-is */
12069 else if (ch < 0x7F) {
12070 PyUnicode_WRITE(okind, odata, o++, ch);
12071 }
12072
12073 /* Non-ASCII characters */
12074 else {
12075 /* Map Unicode whitespace and control characters
12076 (categories Z* and C* except ASCII space)
12077 */
12078 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12079 PyUnicode_WRITE(okind, odata, o++, '\\');
12080 /* Map 8-bit characters to '\xhh' */
12081 if (ch <= 0xff) {
12082 PyUnicode_WRITE(okind, odata, o++, 'x');
12083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12084 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12085 }
12086 /* Map 16-bit characters to '\uxxxx' */
12087 else if (ch <= 0xffff) {
12088 PyUnicode_WRITE(okind, odata, o++, 'u');
12089 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12090 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12093 }
12094 /* Map 21-bit characters to '\U00xxxxxx' */
12095 else {
12096 PyUnicode_WRITE(okind, odata, o++, 'U');
12097 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12098 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12099 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12100 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12101 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12105 }
12106 }
12107 /* Copy characters as-is */
12108 else {
12109 PyUnicode_WRITE(okind, odata, o++, ch);
12110 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012111 }
12112 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012115 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012116 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117}
12118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012119PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121\n\
12122Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012123such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124arguments start and end are interpreted as in slice notation.\n\
12125\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
12128static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012131 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012132 Py_ssize_t start;
12133 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012134 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135
Jesus Ceaac451502011-04-20 17:09:23 +020012136 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12137 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (PyUnicode_READY(self) == -1)
12141 return NULL;
12142 if (PyUnicode_READY(substring) == -1)
12143 return NULL;
12144
Victor Stinner7931d9a2011-11-04 00:22:48 +010012145 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
12147 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (result == -2)
12150 return NULL;
12151
Christian Heimes217cfd12007-12-02 14:31:20 +000012152 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153}
12154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012155PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012156 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012158Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
12160static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012163 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012164 Py_ssize_t start;
12165 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012166 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
Jesus Ceaac451502011-04-20 17:09:23 +020012168 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12169 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 if (PyUnicode_READY(self) == -1)
12173 return NULL;
12174 if (PyUnicode_READY(substring) == -1)
12175 return NULL;
12176
Victor Stinner7931d9a2011-11-04 00:22:48 +010012177 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
12179 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 if (result == -2)
12182 return NULL;
12183
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 if (result < 0) {
12185 PyErr_SetString(PyExc_ValueError, "substring not found");
12186 return NULL;
12187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188
Christian Heimes217cfd12007-12-02 14:31:20 +000012189 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190}
12191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012192PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012193 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012195Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012196done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
12198static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012199unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012201 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 Py_UCS4 fillchar = ' ';
12203
Victor Stinnere9a29352011-10-01 02:14:59 +020012204 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012206
Benjamin Petersonbac79492012-01-14 13:34:47 -050012207 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208 return NULL;
12209
Victor Stinnerc4b49542011-12-11 22:44:26 +010012210 if (PyUnicode_GET_LENGTH(self) >= width)
12211 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
Victor Stinnerc4b49542011-12-11 22:44:26 +010012213 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214}
12215
Alexander Belopolsky40018472011-02-26 01:02:56 +000012216PyObject *
12217PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218{
12219 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012220
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221 s = PyUnicode_FromObject(s);
12222 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012223 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012224 if (sep != NULL) {
12225 sep = PyUnicode_FromObject(sep);
12226 if (sep == NULL) {
12227 Py_DECREF(s);
12228 return NULL;
12229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230 }
12231
Victor Stinner9310abb2011-10-05 00:59:23 +020012232 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
12234 Py_DECREF(s);
12235 Py_XDECREF(sep);
12236 return result;
12237}
12238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012239PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012240 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241\n\
12242Return a list of the words in S, using sep as the\n\
12243delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012244splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012245whitespace string is a separator and empty strings are\n\
12246removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247
12248static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012249unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012251 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012253 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012255 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12256 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 return NULL;
12258
12259 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012262 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012264 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
Thomas Wouters477c8d52006-05-27 19:21:47 +000012267PyObject *
12268PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12269{
12270 PyObject* str_obj;
12271 PyObject* sep_obj;
12272 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 int kind1, kind2, kind;
12274 void *buf1 = NULL, *buf2 = NULL;
12275 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012276
12277 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012278 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012280 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012281 if (!sep_obj) {
12282 Py_DECREF(str_obj);
12283 return NULL;
12284 }
12285 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12286 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012287 Py_DECREF(str_obj);
12288 return NULL;
12289 }
12290
Victor Stinner14f8f022011-10-05 20:58:25 +020012291 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012293 kind = Py_MAX(kind1, kind2);
12294 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012296 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 if (!buf1)
12298 goto onError;
12299 buf2 = PyUnicode_DATA(sep_obj);
12300 if (kind2 != kind)
12301 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12302 if (!buf2)
12303 goto onError;
12304 len1 = PyUnicode_GET_LENGTH(str_obj);
12305 len2 = PyUnicode_GET_LENGTH(sep_obj);
12306
Benjamin Petersonead6b532011-12-20 17:23:42 -060012307 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012309 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12310 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12311 else
12312 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 break;
12314 case PyUnicode_2BYTE_KIND:
12315 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12316 break;
12317 case PyUnicode_4BYTE_KIND:
12318 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12319 break;
12320 default:
12321 assert(0);
12322 out = 0;
12323 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012324
12325 Py_DECREF(sep_obj);
12326 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 if (kind1 != kind)
12328 PyMem_Free(buf1);
12329 if (kind2 != kind)
12330 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012331
12332 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 onError:
12334 Py_DECREF(sep_obj);
12335 Py_DECREF(str_obj);
12336 if (kind1 != kind && buf1)
12337 PyMem_Free(buf1);
12338 if (kind2 != kind && buf2)
12339 PyMem_Free(buf2);
12340 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012341}
12342
12343
12344PyObject *
12345PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12346{
12347 PyObject* str_obj;
12348 PyObject* sep_obj;
12349 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 int kind1, kind2, kind;
12351 void *buf1 = NULL, *buf2 = NULL;
12352 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012353
12354 str_obj = PyUnicode_FromObject(str_in);
12355 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012357 sep_obj = PyUnicode_FromObject(sep_in);
12358 if (!sep_obj) {
12359 Py_DECREF(str_obj);
12360 return NULL;
12361 }
12362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 kind1 = PyUnicode_KIND(str_in);
12364 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012365 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 buf1 = PyUnicode_DATA(str_in);
12367 if (kind1 != kind)
12368 buf1 = _PyUnicode_AsKind(str_in, kind);
12369 if (!buf1)
12370 goto onError;
12371 buf2 = PyUnicode_DATA(sep_obj);
12372 if (kind2 != kind)
12373 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12374 if (!buf2)
12375 goto onError;
12376 len1 = PyUnicode_GET_LENGTH(str_obj);
12377 len2 = PyUnicode_GET_LENGTH(sep_obj);
12378
Benjamin Petersonead6b532011-12-20 17:23:42 -060012379 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012381 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12382 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12383 else
12384 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 break;
12386 case PyUnicode_2BYTE_KIND:
12387 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12388 break;
12389 case PyUnicode_4BYTE_KIND:
12390 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12391 break;
12392 default:
12393 assert(0);
12394 out = 0;
12395 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012396
12397 Py_DECREF(sep_obj);
12398 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 if (kind1 != kind)
12400 PyMem_Free(buf1);
12401 if (kind2 != kind)
12402 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012403
12404 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 onError:
12406 Py_DECREF(sep_obj);
12407 Py_DECREF(str_obj);
12408 if (kind1 != kind && buf1)
12409 PyMem_Free(buf1);
12410 if (kind2 != kind && buf2)
12411 PyMem_Free(buf2);
12412 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012413}
12414
12415PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012417\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012418Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012420found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012421
12422static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012423unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012424{
Victor Stinner9310abb2011-10-05 00:59:23 +020012425 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012426}
12427
12428PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012429 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012430\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012431Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012432the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012433separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012434
12435static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012436unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012437{
Victor Stinner9310abb2011-10-05 00:59:23 +020012438 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012439}
12440
Alexander Belopolsky40018472011-02-26 01:02:56 +000012441PyObject *
12442PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012443{
12444 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012445
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012446 s = PyUnicode_FromObject(s);
12447 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012448 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 if (sep != NULL) {
12450 sep = PyUnicode_FromObject(sep);
12451 if (sep == NULL) {
12452 Py_DECREF(s);
12453 return NULL;
12454 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012455 }
12456
Victor Stinner9310abb2011-10-05 00:59:23 +020012457 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012458
12459 Py_DECREF(s);
12460 Py_XDECREF(sep);
12461 return result;
12462}
12463
12464PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012465 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012466\n\
12467Return a list of the words in S, using sep as the\n\
12468delimiter string, starting at the end of the string and\n\
12469working to the front. If maxsplit is given, at most maxsplit\n\
12470splits are done. If sep is not specified, any whitespace string\n\
12471is a separator.");
12472
12473static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012474unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012475{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012476 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012477 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012478 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012479
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012480 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12481 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012482 return NULL;
12483
12484 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012485 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012486 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012487 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012488 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012489 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012490}
12491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012492PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012493 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494\n\
12495Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012496Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012497is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498
12499static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012500unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012502 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012503 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012505 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12506 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507 return NULL;
12508
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012509 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510}
12511
12512static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012513PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012515 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516}
12517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012518PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012519 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520\n\
12521Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012522and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523
12524static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012525unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012527 if (PyUnicode_READY(self) == -1)
12528 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012529 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530}
12531
Georg Brandlceee0772007-11-27 23:48:05 +000012532PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012533 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012534\n\
12535Return a translation table usable for str.translate().\n\
12536If there is only one argument, it must be a dictionary mapping Unicode\n\
12537ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012538Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012539If there are two arguments, they must be strings of equal length, and\n\
12540in the resulting dictionary, each character in x will be mapped to the\n\
12541character at the same position in y. If there is a third argument, it\n\
12542must be a string, whose characters will be mapped to None in the result.");
12543
12544static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012545unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012546{
12547 PyObject *x, *y = NULL, *z = NULL;
12548 PyObject *new = NULL, *key, *value;
12549 Py_ssize_t i = 0;
12550 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012551
Georg Brandlceee0772007-11-27 23:48:05 +000012552 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12553 return NULL;
12554 new = PyDict_New();
12555 if (!new)
12556 return NULL;
12557 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 int x_kind, y_kind, z_kind;
12559 void *x_data, *y_data, *z_data;
12560
Georg Brandlceee0772007-11-27 23:48:05 +000012561 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012562 if (!PyUnicode_Check(x)) {
12563 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12564 "be a string if there is a second argument");
12565 goto err;
12566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012568 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12569 "arguments must have equal length");
12570 goto err;
12571 }
12572 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 x_kind = PyUnicode_KIND(x);
12574 y_kind = PyUnicode_KIND(y);
12575 x_data = PyUnicode_DATA(x);
12576 y_data = PyUnicode_DATA(y);
12577 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12578 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012579 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012580 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012581 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012582 if (!value) {
12583 Py_DECREF(key);
12584 goto err;
12585 }
Georg Brandlceee0772007-11-27 23:48:05 +000012586 res = PyDict_SetItem(new, key, value);
12587 Py_DECREF(key);
12588 Py_DECREF(value);
12589 if (res < 0)
12590 goto err;
12591 }
12592 /* create entries for deleting chars in z */
12593 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 z_kind = PyUnicode_KIND(z);
12595 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012596 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012598 if (!key)
12599 goto err;
12600 res = PyDict_SetItem(new, key, Py_None);
12601 Py_DECREF(key);
12602 if (res < 0)
12603 goto err;
12604 }
12605 }
12606 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 int kind;
12608 void *data;
12609
Georg Brandlceee0772007-11-27 23:48:05 +000012610 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012611 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012612 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12613 "to maketrans it must be a dict");
12614 goto err;
12615 }
12616 /* copy entries into the new dict, converting string keys to int keys */
12617 while (PyDict_Next(x, &i, &key, &value)) {
12618 if (PyUnicode_Check(key)) {
12619 /* convert string keys to integer keys */
12620 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012621 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012622 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12623 "table must be of length 1");
12624 goto err;
12625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 kind = PyUnicode_KIND(key);
12627 data = PyUnicode_DATA(key);
12628 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012629 if (!newkey)
12630 goto err;
12631 res = PyDict_SetItem(new, newkey, value);
12632 Py_DECREF(newkey);
12633 if (res < 0)
12634 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012635 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012636 /* just keep integer keys */
12637 if (PyDict_SetItem(new, key, value) < 0)
12638 goto err;
12639 } else {
12640 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12641 "be strings or integers");
12642 goto err;
12643 }
12644 }
12645 }
12646 return new;
12647 err:
12648 Py_DECREF(new);
12649 return NULL;
12650}
12651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012652PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654\n\
12655Return a copy of the string S, where all characters have been mapped\n\
12656through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012657Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012658Unmapped characters are left untouched. Characters mapped to None\n\
12659are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660
12661static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665}
12666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012667PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012670Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671
12672static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012673unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012675 if (PyUnicode_READY(self) == -1)
12676 return NULL;
12677 if (PyUnicode_IS_ASCII(self))
12678 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012679 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680}
12681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012682PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012683 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012685Pad a numeric string S with zeros on the left, to fill a field\n\
12686of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687
12688static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012689unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012691 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012692 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012693 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 int kind;
12695 void *data;
12696 Py_UCS4 chr;
12697
Martin v. Löwis18e16552006-02-15 17:27:45 +000012698 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699 return NULL;
12700
Benjamin Petersonbac79492012-01-14 13:34:47 -050012701 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012702 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703
Victor Stinnerc4b49542011-12-11 22:44:26 +010012704 if (PyUnicode_GET_LENGTH(self) >= width)
12705 return unicode_result_unchanged(self);
12706
12707 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708
12709 u = pad(self, fill, 0, '0');
12710
Walter Dörwald068325e2002-04-15 13:36:47 +000012711 if (u == NULL)
12712 return NULL;
12713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 kind = PyUnicode_KIND(u);
12715 data = PyUnicode_DATA(u);
12716 chr = PyUnicode_READ(kind, data, fill);
12717
12718 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 PyUnicode_WRITE(kind, data, 0, chr);
12721 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 }
12723
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012724 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012725 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727
#if 0
/* Compiled out.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012736PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012739Return True if S starts with the specified prefix, False otherwise.\n\
12740With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012741With optional end, stop comparing S at that position.\n\
12742prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743
12744static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012745unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012748 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012749 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012750 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012751 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012752 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012753
Jesus Ceaac451502011-04-20 17:09:23 +020012754 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012756 if (PyTuple_Check(subobj)) {
12757 Py_ssize_t i;
12758 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012759 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012760 if (substring == NULL)
12761 return NULL;
12762 result = tailmatch(self, substring, start, end, -1);
12763 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012764 if (result == -1)
12765 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012766 if (result) {
12767 Py_RETURN_TRUE;
12768 }
12769 }
12770 /* nothing matched */
12771 Py_RETURN_FALSE;
12772 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012773 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012774 if (substring == NULL) {
12775 if (PyErr_ExceptionMatches(PyExc_TypeError))
12776 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12777 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012778 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012779 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012780 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012782 if (result == -1)
12783 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012784 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785}
12786
12787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012788PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012791Return True if S ends with the specified suffix, False otherwise.\n\
12792With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012793With optional end, stop comparing S at that position.\n\
12794suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795
12796static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012797unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012800 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012801 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012802 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012803 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012804 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
Jesus Ceaac451502011-04-20 17:09:23 +020012806 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012808 if (PyTuple_Check(subobj)) {
12809 Py_ssize_t i;
12810 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012811 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012812 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012813 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012815 result = tailmatch(self, substring, start, end, +1);
12816 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012817 if (result == -1)
12818 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012819 if (result) {
12820 Py_RETURN_TRUE;
12821 }
12822 }
12823 Py_RETURN_FALSE;
12824 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012825 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012826 if (substring == NULL) {
12827 if (PyErr_ExceptionMatches(PyExc_TypeError))
12828 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12829 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012831 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012832 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010012833 if (result == -1)
12834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012836 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837}
12838
Victor Stinner202fdca2012-05-07 12:47:02 +020012839Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012840_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012841{
Victor Stinner8f674cc2013-04-17 23:02:17 +020012842 if (!writer->readonly)
12843 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12844 else {
12845 /* Copy-on-write mode: set buffer size to 0 so
12846 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
12847 * next write. */
12848 writer->size = 0;
12849 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012850 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12851 writer->data = PyUnicode_DATA(writer->buffer);
12852 writer->kind = PyUnicode_KIND(writer->buffer);
12853}
12854
Victor Stinnerd3f08822012-05-29 12:57:52 +020012855void
Victor Stinner8f674cc2013-04-17 23:02:17 +020012856_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012857{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012858 memset(writer, 0, sizeof(*writer));
12859#ifdef Py_DEBUG
12860 writer->kind = 5; /* invalid kind */
12861#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020012862 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020012863}
12864
Victor Stinnerd3f08822012-05-29 12:57:52 +020012865int
12866_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12867 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012868{
12869 Py_ssize_t newlen;
12870 PyObject *newbuffer;
12871
Victor Stinnerd3f08822012-05-29 12:57:52 +020012872 assert(length > 0);
12873
Victor Stinner202fdca2012-05-07 12:47:02 +020012874 if (length > PY_SSIZE_T_MAX - writer->pos) {
12875 PyErr_NoMemory();
12876 return -1;
12877 }
12878 newlen = writer->pos + length;
12879
Victor Stinner8f674cc2013-04-17 23:02:17 +020012880 maxchar = MAX_MAXCHAR(maxchar, writer->min_char);
12881
Victor Stinnerd3f08822012-05-29 12:57:52 +020012882 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020012883 assert(!writer->readonly);
12884 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012885 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020012886 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012887 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020012888 if (newlen < writer->min_length)
12889 newlen = writer->min_length;
12890
Victor Stinnerd3f08822012-05-29 12:57:52 +020012891 writer->buffer = PyUnicode_New(newlen, maxchar);
12892 if (writer->buffer == NULL)
12893 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012894 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020012895 else if (newlen > writer->size) {
12896 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012897 /* overallocate 25% to limit the number of resize */
Victor Stinner8f674cc2013-04-17 23:02:17 +020012898 newlen += newlen / 4;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012899 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020012900 if (newlen < writer->min_length)
12901 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012902
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012903 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012904 /* resize + widen */
12905 newbuffer = PyUnicode_New(newlen, maxchar);
12906 if (newbuffer == NULL)
12907 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012908 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12909 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012910 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012911 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012912 }
12913 else {
12914 newbuffer = resize_compact(writer->buffer, newlen);
12915 if (newbuffer == NULL)
12916 return -1;
12917 }
12918 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012919 }
12920 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012921 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012922 newbuffer = PyUnicode_New(writer->size, maxchar);
12923 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012924 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012925 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12926 writer->buffer, 0, writer->pos);
12927 Py_DECREF(writer->buffer);
12928 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012929 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020012930 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012931 return 0;
12932}
12933
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020012934Py_LOCAL_INLINE(int)
12935_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020012936{
12937 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
12938 return -1;
12939 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
12940 writer->pos++;
12941 return 0;
12942}
12943
12944int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020012945_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
12946{
12947 return _PyUnicodeWriter_WriteCharInline(writer, ch);
12948}
12949
12950int
Victor Stinnerd3f08822012-05-29 12:57:52 +020012951_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12952{
12953 Py_UCS4 maxchar;
12954 Py_ssize_t len;
12955
12956 if (PyUnicode_READY(str) == -1)
12957 return -1;
12958 len = PyUnicode_GET_LENGTH(str);
12959 if (len == 0)
12960 return 0;
12961 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12962 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012963 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020012964 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012965 Py_INCREF(str);
12966 writer->buffer = str;
12967 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012968 writer->pos += len;
12969 return 0;
12970 }
12971 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12972 return -1;
12973 }
12974 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12975 str, 0, len);
12976 writer->pos += len;
12977 return 0;
12978}
12979
Victor Stinnere215d962012-10-06 23:03:36 +020012980int
Victor Stinnercfc4c132013-04-03 01:48:39 +020012981_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
12982 Py_ssize_t start, Py_ssize_t end)
12983{
12984 Py_UCS4 maxchar;
12985 Py_ssize_t len;
12986
12987 if (PyUnicode_READY(str) == -1)
12988 return -1;
12989
12990 assert(0 <= start);
12991 assert(end <= PyUnicode_GET_LENGTH(str));
12992 assert(start <= end);
12993
12994 if (end == 0)
12995 return 0;
12996
12997 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
12998 return _PyUnicodeWriter_WriteStr(writer, str);
12999
13000 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13001 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13002 else
13003 maxchar = writer->maxchar;
13004 len = end - start;
13005
13006 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13007 return -1;
13008
13009 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13010 str, start, len);
13011 writer->pos += len;
13012 return 0;
13013}
13014
13015int
Victor Stinnere215d962012-10-06 23:03:36 +020013016_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13017{
13018 Py_UCS4 maxchar;
13019
13020 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13021 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13022 return -1;
13023 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13024 writer->pos += len;
13025 return 0;
13026}
13027
Victor Stinnerd3f08822012-05-29 12:57:52 +020013028PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013029_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013030{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013031 if (writer->pos == 0) {
13032 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013033 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013034 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013035 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013036 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
13037 return writer->buffer;
13038 }
13039 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13040 PyObject *newbuffer;
13041 newbuffer = resize_compact(writer->buffer, writer->pos);
13042 if (newbuffer == NULL) {
13043 Py_DECREF(writer->buffer);
13044 return NULL;
13045 }
13046 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013047 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020013048 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010013049 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013050}
13051
Victor Stinnerd3f08822012-05-29 12:57:52 +020013052void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013053_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013054{
13055 Py_CLEAR(writer->buffer);
13056}
13057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013058#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013059
13060PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013061 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013062\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013063Return a formatted version of S, using substitutions from args and kwargs.\n\
13064The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013065
Eric Smith27bbca62010-11-04 17:06:58 +000013066PyDoc_STRVAR(format_map__doc__,
13067 "S.format_map(mapping) -> str\n\
13068\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013069Return a formatted version of S, using substitutions from mapping.\n\
13070The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013071
Eric Smith4a7d76d2008-05-30 18:10:19 +000013072static PyObject *
13073unicode__format__(PyObject* self, PyObject* args)
13074{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013075 PyObject *format_spec;
13076 _PyUnicodeWriter writer;
13077 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013078
13079 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13080 return NULL;
13081
Victor Stinnerd3f08822012-05-29 12:57:52 +020013082 if (PyUnicode_READY(self) == -1)
13083 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013084 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013085 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13086 self, format_spec, 0,
13087 PyUnicode_GET_LENGTH(format_spec));
13088 if (ret == -1) {
13089 _PyUnicodeWriter_Dealloc(&writer);
13090 return NULL;
13091 }
13092 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013093}
13094
Eric Smith8c663262007-08-25 02:26:07 +000013095PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013096 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013097\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013098Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013099
13100static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013101unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 Py_ssize_t size;
13104
13105 /* If it's a compact object, account for base structure +
13106 character data. */
13107 if (PyUnicode_IS_COMPACT_ASCII(v))
13108 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13109 else if (PyUnicode_IS_COMPACT(v))
13110 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013111 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 else {
13113 /* If it is a two-block object, account for base object, and
13114 for character block if present. */
13115 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013116 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013118 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119 }
13120 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013121 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013122 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013124 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013125 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126
13127 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013128}
13129
13130PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013132
13133static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013134unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013135{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013136 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 if (!copy)
13138 return NULL;
13139 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013140}
13141
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013143 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013144 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013145 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13146 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013147 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13148 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013149 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013150 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13151 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13152 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13153 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13154 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013155 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013156 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13157 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13158 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013159 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013160 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13161 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13162 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013163 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013164 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013165 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013166 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013167 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13168 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13169 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13170 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13171 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13172 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13173 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13174 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13175 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13176 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13177 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13178 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13179 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13180 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013181 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013182 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013183 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013184 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013185 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013186 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013187 {"maketrans", (PyCFunction) unicode_maketrans,
13188 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013189 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013190#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013191 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013192 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193#endif
13194
Benjamin Peterson14339b62009-01-31 16:36:08 +000013195 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196 {NULL, NULL}
13197};
13198
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013199static PyObject *
13200unicode_mod(PyObject *v, PyObject *w)
13201{
Brian Curtindfc80e32011-08-10 20:28:54 -050013202 if (!PyUnicode_Check(v))
13203 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013204 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013205}
13206
13207static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013208 0, /*nb_add*/
13209 0, /*nb_subtract*/
13210 0, /*nb_multiply*/
13211 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013212};
13213
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013215 (lenfunc) unicode_length, /* sq_length */
13216 PyUnicode_Concat, /* sq_concat */
13217 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13218 (ssizeargfunc) unicode_getitem, /* sq_item */
13219 0, /* sq_slice */
13220 0, /* sq_ass_item */
13221 0, /* sq_ass_slice */
13222 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223};
13224
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013225static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013226unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228 if (PyUnicode_READY(self) == -1)
13229 return NULL;
13230
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013231 if (PyIndex_Check(item)) {
13232 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013233 if (i == -1 && PyErr_Occurred())
13234 return NULL;
13235 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013237 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013238 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013239 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013240 PyObject *result;
13241 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013242 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013243 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013246 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013247 return NULL;
13248 }
13249
13250 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013251 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013253 slicelength == PyUnicode_GET_LENGTH(self)) {
13254 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013255 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013256 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013257 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013258 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013259 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013260 src_kind = PyUnicode_KIND(self);
13261 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013262 if (!PyUnicode_IS_ASCII(self)) {
13263 kind_limit = kind_maxchar_limit(src_kind);
13264 max_char = 0;
13265 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13266 ch = PyUnicode_READ(src_kind, src_data, cur);
13267 if (ch > max_char) {
13268 max_char = ch;
13269 if (max_char >= kind_limit)
13270 break;
13271 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013272 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013273 }
Victor Stinner55c99112011-10-13 01:17:06 +020013274 else
13275 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013276 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013277 if (result == NULL)
13278 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013279 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013280 dest_data = PyUnicode_DATA(result);
13281
13282 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013283 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13284 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013285 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013286 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013287 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013288 } else {
13289 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13290 return NULL;
13291 }
13292}
13293
13294static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013295 (lenfunc)unicode_length, /* mp_length */
13296 (binaryfunc)unicode_subscript, /* mp_subscript */
13297 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013298};
13299
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301/* Helpers for PyUnicode_Format() */
13302
Victor Stinnera47082312012-10-04 02:19:54 +020013303struct unicode_formatter_t {
13304 PyObject *args;
13305 int args_owned;
13306 Py_ssize_t arglen, argidx;
13307 PyObject *dict;
13308
13309 enum PyUnicode_Kind fmtkind;
13310 Py_ssize_t fmtcnt, fmtpos;
13311 void *fmtdata;
13312 PyObject *fmtstr;
13313
13314 _PyUnicodeWriter writer;
13315};
13316
13317struct unicode_format_arg_t {
13318 Py_UCS4 ch;
13319 int flags;
13320 Py_ssize_t width;
13321 int prec;
13322 int sign;
13323};
13324
Guido van Rossumd57fd912000-03-10 22:53:23 +000013325static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013326unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327{
Victor Stinnera47082312012-10-04 02:19:54 +020013328 Py_ssize_t argidx = ctx->argidx;
13329
13330 if (argidx < ctx->arglen) {
13331 ctx->argidx++;
13332 if (ctx->arglen < 0)
13333 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 else
Victor Stinnera47082312012-10-04 02:19:54 +020013335 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336 }
13337 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013338 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339 return NULL;
13340}
13341
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013342/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343
Victor Stinnera47082312012-10-04 02:19:54 +020013344/* Format a float into the writer if the writer is not NULL, or into *p_output
13345 otherwise.
13346
13347 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013348static int
Victor Stinnera47082312012-10-04 02:19:54 +020013349formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13350 PyObject **p_output,
13351 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013353 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013354 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013355 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013356 int prec;
13357 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013358
Guido van Rossumd57fd912000-03-10 22:53:23 +000013359 x = PyFloat_AsDouble(v);
13360 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013361 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013362
Victor Stinnera47082312012-10-04 02:19:54 +020013363 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013365 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013366
Victor Stinnera47082312012-10-04 02:19:54 +020013367 if (arg->flags & F_ALT)
13368 dtoa_flags = Py_DTSF_ALT;
13369 else
13370 dtoa_flags = 0;
13371 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013372 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013373 return -1;
13374 len = strlen(p);
13375 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013376 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13377 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013378 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013379 }
Victor Stinner184252a2012-06-16 02:57:41 +020013380 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013381 writer->pos += len;
13382 }
13383 else
13384 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013385 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013386 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013387}
13388
Victor Stinnerd0880d52012-04-27 23:40:13 +020013389/* formatlong() emulates the format codes d, u, o, x and X, and
13390 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13391 * Python's regular ints.
13392 * Return value: a new PyUnicodeObject*, or NULL if error.
13393 * The output string is of the form
13394 * "-"? ("0x" | "0X")? digit+
13395 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13396 * set in flags. The case of hex digits will be correct,
13397 * There will be at least prec digits, zero-filled on the left if
13398 * necessary to get that many.
13399 * val object to be converted
13400 * flags bitmask of format flags; only F_ALT is looked at
13401 * prec minimum number of digits; 0-fill on left if needed
13402 * type a character in [duoxX]; u acts the same as d
13403 *
13404 * CAUTION: o, x and X conversions on regular ints can never
13405 * produce a '-' sign, but can for Python's unbounded ints.
13406 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013407static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013408formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013409{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013410 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013411 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013412 Py_ssize_t i;
13413 int sign; /* 1 if '-', else 0 */
13414 int len; /* number of characters */
13415 Py_ssize_t llen;
13416 int numdigits; /* len == numnondigits + numdigits */
13417 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013418 int prec = arg->prec;
13419 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013420
Victor Stinnerd0880d52012-04-27 23:40:13 +020013421 /* Avoid exceeding SSIZE_T_MAX */
13422 if (prec > INT_MAX-3) {
13423 PyErr_SetString(PyExc_OverflowError,
13424 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013425 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013426 }
13427
13428 assert(PyLong_Check(val));
13429
13430 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013431 default:
13432 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013433 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013434 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013435 case 'u':
13436 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013437 if (PyBool_Check(val))
13438 result = PyNumber_ToBase(val, 10);
13439 else
13440 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013441 break;
13442 case 'o':
13443 numnondigits = 2;
13444 result = PyNumber_ToBase(val, 8);
13445 break;
13446 case 'x':
13447 case 'X':
13448 numnondigits = 2;
13449 result = PyNumber_ToBase(val, 16);
13450 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013451 }
13452 if (!result)
13453 return NULL;
13454
13455 assert(unicode_modifiable(result));
13456 assert(PyUnicode_IS_READY(result));
13457 assert(PyUnicode_IS_ASCII(result));
13458
13459 /* To modify the string in-place, there can only be one reference. */
13460 if (Py_REFCNT(result) != 1) {
13461 PyErr_BadInternalCall();
13462 return NULL;
13463 }
13464 buf = PyUnicode_DATA(result);
13465 llen = PyUnicode_GET_LENGTH(result);
13466 if (llen > INT_MAX) {
13467 PyErr_SetString(PyExc_ValueError,
13468 "string too large in _PyBytes_FormatLong");
13469 return NULL;
13470 }
13471 len = (int)llen;
13472 sign = buf[0] == '-';
13473 numnondigits += sign;
13474 numdigits = len - numnondigits;
13475 assert(numdigits > 0);
13476
13477 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013478 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013479 (type == 'o' || type == 'x' || type == 'X'))) {
13480 assert(buf[sign] == '0');
13481 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13482 buf[sign+1] == 'o');
13483 numnondigits -= 2;
13484 buf += 2;
13485 len -= 2;
13486 if (sign)
13487 buf[0] = '-';
13488 assert(len == numnondigits + numdigits);
13489 assert(numdigits > 0);
13490 }
13491
13492 /* Fill with leading zeroes to meet minimum width. */
13493 if (prec > numdigits) {
13494 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13495 numnondigits + prec);
13496 char *b1;
13497 if (!r1) {
13498 Py_DECREF(result);
13499 return NULL;
13500 }
13501 b1 = PyBytes_AS_STRING(r1);
13502 for (i = 0; i < numnondigits; ++i)
13503 *b1++ = *buf++;
13504 for (i = 0; i < prec - numdigits; i++)
13505 *b1++ = '0';
13506 for (i = 0; i < numdigits; i++)
13507 *b1++ = *buf++;
13508 *b1 = '\0';
13509 Py_DECREF(result);
13510 result = r1;
13511 buf = PyBytes_AS_STRING(result);
13512 len = numnondigits + prec;
13513 }
13514
13515 /* Fix up case for hex conversions. */
13516 if (type == 'X') {
13517 /* Need to convert all lower case letters to upper case.
13518 and need to convert 0x to 0X (and -0x to -0X). */
13519 for (i = 0; i < len; i++)
13520 if (buf[i] >= 'a' && buf[i] <= 'x')
13521 buf[i] -= 'a'-'A';
13522 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013523 if (!PyUnicode_Check(result)
13524 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013525 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013526 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013527 Py_DECREF(result);
13528 result = unicode;
13529 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013530 else if (len != PyUnicode_GET_LENGTH(result)) {
13531 if (PyUnicode_Resize(&result, len) < 0)
13532 Py_CLEAR(result);
13533 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013534 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013535}
13536
Victor Stinner621ef3d2012-10-02 00:33:47 +020013537/* Format an integer.
13538 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013539 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013540 * -1 and raise an exception on error */
13541static int
Victor Stinnera47082312012-10-04 02:19:54 +020013542mainformatlong(PyObject *v,
13543 struct unicode_format_arg_t *arg,
13544 PyObject **p_output,
13545 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013546{
13547 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013548 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013549
13550 if (!PyNumber_Check(v))
13551 goto wrongtype;
13552
13553 if (!PyLong_Check(v)) {
13554 iobj = PyNumber_Long(v);
13555 if (iobj == NULL) {
13556 if (PyErr_ExceptionMatches(PyExc_TypeError))
13557 goto wrongtype;
13558 return -1;
13559 }
13560 assert(PyLong_Check(iobj));
13561 }
13562 else {
13563 iobj = v;
13564 Py_INCREF(iobj);
13565 }
13566
13567 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013568 && arg->width == -1 && arg->prec == -1
13569 && !(arg->flags & (F_SIGN | F_BLANK))
13570 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013571 {
13572 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013573 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013574 int base;
13575
Victor Stinnera47082312012-10-04 02:19:54 +020013576 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013577 {
13578 default:
13579 assert(0 && "'type' not in [diuoxX]");
13580 case 'd':
13581 case 'i':
13582 case 'u':
13583 base = 10;
13584 break;
13585 case 'o':
13586 base = 8;
13587 break;
13588 case 'x':
13589 case 'X':
13590 base = 16;
13591 break;
13592 }
13593
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013594 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13595 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013596 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013597 }
13598 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013599 return 1;
13600 }
13601
Victor Stinnera47082312012-10-04 02:19:54 +020013602 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013603 Py_DECREF(iobj);
13604 if (res == NULL)
13605 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013606 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013607 return 0;
13608
13609wrongtype:
13610 PyErr_Format(PyExc_TypeError,
13611 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013612 "not %.200s",
13613 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013614 return -1;
13615}
13616
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013617static Py_UCS4
13618formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013619{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013620 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013621 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013622 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013623 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013625 goto onError;
13626 }
13627 else {
13628 /* Integer input truncated to a character */
13629 long x;
13630 x = PyLong_AsLong(v);
13631 if (x == -1 && PyErr_Occurred())
13632 goto onError;
13633
Victor Stinner8faf8212011-12-08 22:14:11 +010013634 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 PyErr_SetString(PyExc_OverflowError,
13636 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013637 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013638 }
13639
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013640 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013641 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013642
Benjamin Peterson29060642009-01-31 22:14:21 +000013643 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013644 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013645 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013646 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013647}
13648
Victor Stinnera47082312012-10-04 02:19:54 +020013649/* Parse options of an argument: flags, width, precision.
13650 Handle also "%(name)" syntax.
13651
13652 Return 0 if the argument has been formatted into arg->str.
13653 Return 1 if the argument has been written into ctx->writer,
13654 Raise an exception and return -1 on error. */
13655static int
13656unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13657 struct unicode_format_arg_t *arg)
13658{
13659#define FORMAT_READ(ctx) \
13660 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13661
13662 PyObject *v;
13663
Victor Stinnera47082312012-10-04 02:19:54 +020013664 if (arg->ch == '(') {
13665 /* Get argument value from a dictionary. Example: "%(name)s". */
13666 Py_ssize_t keystart;
13667 Py_ssize_t keylen;
13668 PyObject *key;
13669 int pcount = 1;
13670
13671 if (ctx->dict == NULL) {
13672 PyErr_SetString(PyExc_TypeError,
13673 "format requires a mapping");
13674 return -1;
13675 }
13676 ++ctx->fmtpos;
13677 --ctx->fmtcnt;
13678 keystart = ctx->fmtpos;
13679 /* Skip over balanced parentheses */
13680 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13681 arg->ch = FORMAT_READ(ctx);
13682 if (arg->ch == ')')
13683 --pcount;
13684 else if (arg->ch == '(')
13685 ++pcount;
13686 ctx->fmtpos++;
13687 }
13688 keylen = ctx->fmtpos - keystart - 1;
13689 if (ctx->fmtcnt < 0 || pcount > 0) {
13690 PyErr_SetString(PyExc_ValueError,
13691 "incomplete format key");
13692 return -1;
13693 }
13694 key = PyUnicode_Substring(ctx->fmtstr,
13695 keystart, keystart + keylen);
13696 if (key == NULL)
13697 return -1;
13698 if (ctx->args_owned) {
13699 Py_DECREF(ctx->args);
13700 ctx->args_owned = 0;
13701 }
13702 ctx->args = PyObject_GetItem(ctx->dict, key);
13703 Py_DECREF(key);
13704 if (ctx->args == NULL)
13705 return -1;
13706 ctx->args_owned = 1;
13707 ctx->arglen = -1;
13708 ctx->argidx = -2;
13709 }
13710
13711 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013712 while (--ctx->fmtcnt >= 0) {
13713 arg->ch = FORMAT_READ(ctx);
13714 ctx->fmtpos++;
13715 switch (arg->ch) {
13716 case '-': arg->flags |= F_LJUST; continue;
13717 case '+': arg->flags |= F_SIGN; continue;
13718 case ' ': arg->flags |= F_BLANK; continue;
13719 case '#': arg->flags |= F_ALT; continue;
13720 case '0': arg->flags |= F_ZERO; continue;
13721 }
13722 break;
13723 }
13724
13725 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013726 if (arg->ch == '*') {
13727 v = unicode_format_getnextarg(ctx);
13728 if (v == NULL)
13729 return -1;
13730 if (!PyLong_Check(v)) {
13731 PyErr_SetString(PyExc_TypeError,
13732 "* wants int");
13733 return -1;
13734 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013735 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013736 if (arg->width == -1 && PyErr_Occurred())
13737 return -1;
13738 if (arg->width < 0) {
13739 arg->flags |= F_LJUST;
13740 arg->width = -arg->width;
13741 }
13742 if (--ctx->fmtcnt >= 0) {
13743 arg->ch = FORMAT_READ(ctx);
13744 ctx->fmtpos++;
13745 }
13746 }
13747 else if (arg->ch >= '0' && arg->ch <= '9') {
13748 arg->width = arg->ch - '0';
13749 while (--ctx->fmtcnt >= 0) {
13750 arg->ch = FORMAT_READ(ctx);
13751 ctx->fmtpos++;
13752 if (arg->ch < '0' || arg->ch > '9')
13753 break;
13754 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13755 mixing signed and unsigned comparison. Since arg->ch is between
13756 '0' and '9', casting to int is safe. */
13757 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13758 PyErr_SetString(PyExc_ValueError,
13759 "width too big");
13760 return -1;
13761 }
13762 arg->width = arg->width*10 + (arg->ch - '0');
13763 }
13764 }
13765
13766 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013767 if (arg->ch == '.') {
13768 arg->prec = 0;
13769 if (--ctx->fmtcnt >= 0) {
13770 arg->ch = FORMAT_READ(ctx);
13771 ctx->fmtpos++;
13772 }
13773 if (arg->ch == '*') {
13774 v = unicode_format_getnextarg(ctx);
13775 if (v == NULL)
13776 return -1;
13777 if (!PyLong_Check(v)) {
13778 PyErr_SetString(PyExc_TypeError,
13779 "* wants int");
13780 return -1;
13781 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013782 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013783 if (arg->prec == -1 && PyErr_Occurred())
13784 return -1;
13785 if (arg->prec < 0)
13786 arg->prec = 0;
13787 if (--ctx->fmtcnt >= 0) {
13788 arg->ch = FORMAT_READ(ctx);
13789 ctx->fmtpos++;
13790 }
13791 }
13792 else if (arg->ch >= '0' && arg->ch <= '9') {
13793 arg->prec = arg->ch - '0';
13794 while (--ctx->fmtcnt >= 0) {
13795 arg->ch = FORMAT_READ(ctx);
13796 ctx->fmtpos++;
13797 if (arg->ch < '0' || arg->ch > '9')
13798 break;
13799 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13800 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013801 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013802 return -1;
13803 }
13804 arg->prec = arg->prec*10 + (arg->ch - '0');
13805 }
13806 }
13807 }
13808
13809 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13810 if (ctx->fmtcnt >= 0) {
13811 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13812 if (--ctx->fmtcnt >= 0) {
13813 arg->ch = FORMAT_READ(ctx);
13814 ctx->fmtpos++;
13815 }
13816 }
13817 }
13818 if (ctx->fmtcnt < 0) {
13819 PyErr_SetString(PyExc_ValueError,
13820 "incomplete format");
13821 return -1;
13822 }
13823 return 0;
13824
13825#undef FORMAT_READ
13826}
13827
13828/* Format one argument. Supported conversion specifiers:
13829
13830 - "s", "r", "a": any type
13831 - "i", "d", "u", "o", "x", "X": int
13832 - "e", "E", "f", "F", "g", "G": float
13833 - "c": int or str (1 character)
13834
Victor Stinner8dbd4212012-12-04 09:30:24 +010013835 When possible, the output is written directly into the Unicode writer
13836 (ctx->writer). A string is created when padding is required.
13837
Victor Stinnera47082312012-10-04 02:19:54 +020013838 Return 0 if the argument has been formatted into *p_str,
13839 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013840 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013841static int
13842unicode_format_arg_format(struct unicode_formatter_t *ctx,
13843 struct unicode_format_arg_t *arg,
13844 PyObject **p_str)
13845{
13846 PyObject *v;
13847 _PyUnicodeWriter *writer = &ctx->writer;
13848
13849 if (ctx->fmtcnt == 0)
13850 ctx->writer.overallocate = 0;
13851
13852 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013853 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020013854 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013855 return 1;
13856 }
13857
13858 v = unicode_format_getnextarg(ctx);
13859 if (v == NULL)
13860 return -1;
13861
Victor Stinnera47082312012-10-04 02:19:54 +020013862
13863 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013864 case 's':
13865 case 'r':
13866 case 'a':
13867 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13868 /* Fast path */
13869 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13870 return -1;
13871 return 1;
13872 }
13873
13874 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13875 *p_str = v;
13876 Py_INCREF(*p_str);
13877 }
13878 else {
13879 if (arg->ch == 's')
13880 *p_str = PyObject_Str(v);
13881 else if (arg->ch == 'r')
13882 *p_str = PyObject_Repr(v);
13883 else
13884 *p_str = PyObject_ASCII(v);
13885 }
13886 break;
13887
13888 case 'i':
13889 case 'd':
13890 case 'u':
13891 case 'o':
13892 case 'x':
13893 case 'X':
13894 {
13895 int ret = mainformatlong(v, arg, p_str, writer);
13896 if (ret != 0)
13897 return ret;
13898 arg->sign = 1;
13899 break;
13900 }
13901
13902 case 'e':
13903 case 'E':
13904 case 'f':
13905 case 'F':
13906 case 'g':
13907 case 'G':
13908 if (arg->width == -1 && arg->prec == -1
13909 && !(arg->flags & (F_SIGN | F_BLANK)))
13910 {
13911 /* Fast path */
13912 if (formatfloat(v, arg, NULL, writer) == -1)
13913 return -1;
13914 return 1;
13915 }
13916
13917 arg->sign = 1;
13918 if (formatfloat(v, arg, p_str, NULL) == -1)
13919 return -1;
13920 break;
13921
13922 case 'c':
13923 {
13924 Py_UCS4 ch = formatchar(v);
13925 if (ch == (Py_UCS4) -1)
13926 return -1;
13927 if (arg->width == -1 && arg->prec == -1) {
13928 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013929 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020013930 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013931 return 1;
13932 }
13933 *p_str = PyUnicode_FromOrdinal(ch);
13934 break;
13935 }
13936
13937 default:
13938 PyErr_Format(PyExc_ValueError,
13939 "unsupported format character '%c' (0x%x) "
13940 "at index %zd",
13941 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13942 (int)arg->ch,
13943 ctx->fmtpos - 1);
13944 return -1;
13945 }
13946 if (*p_str == NULL)
13947 return -1;
13948 assert (PyUnicode_Check(*p_str));
13949 return 0;
13950}
13951
13952static int
13953unicode_format_arg_output(struct unicode_formatter_t *ctx,
13954 struct unicode_format_arg_t *arg,
13955 PyObject *str)
13956{
13957 Py_ssize_t len;
13958 enum PyUnicode_Kind kind;
13959 void *pbuf;
13960 Py_ssize_t pindex;
13961 Py_UCS4 signchar;
13962 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013963 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020013964 Py_ssize_t sublen;
13965 _PyUnicodeWriter *writer = &ctx->writer;
13966 Py_UCS4 fill;
13967
13968 fill = ' ';
13969 if (arg->sign && arg->flags & F_ZERO)
13970 fill = '0';
13971
13972 if (PyUnicode_READY(str) == -1)
13973 return -1;
13974
13975 len = PyUnicode_GET_LENGTH(str);
13976 if ((arg->width == -1 || arg->width <= len)
13977 && (arg->prec == -1 || arg->prec >= len)
13978 && !(arg->flags & (F_SIGN | F_BLANK)))
13979 {
13980 /* Fast path */
13981 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13982 return -1;
13983 return 0;
13984 }
13985
13986 /* Truncate the string for "s", "r" and "a" formats
13987 if the precision is set */
13988 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13989 if (arg->prec >= 0 && len > arg->prec)
13990 len = arg->prec;
13991 }
13992
13993 /* Adjust sign and width */
13994 kind = PyUnicode_KIND(str);
13995 pbuf = PyUnicode_DATA(str);
13996 pindex = 0;
13997 signchar = '\0';
13998 if (arg->sign) {
13999 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14000 if (ch == '-' || ch == '+') {
14001 signchar = ch;
14002 len--;
14003 pindex++;
14004 }
14005 else if (arg->flags & F_SIGN)
14006 signchar = '+';
14007 else if (arg->flags & F_BLANK)
14008 signchar = ' ';
14009 else
14010 arg->sign = 0;
14011 }
14012 if (arg->width < len)
14013 arg->width = len;
14014
14015 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014016 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014017 if (!(arg->flags & F_LJUST)) {
14018 if (arg->sign) {
14019 if ((arg->width-1) > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014020 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014021 }
14022 else {
14023 if (arg->width > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014024 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014025 }
14026 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014027 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14028 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14029 maxchar = MAX_MAXCHAR(maxchar, strmaxchar);
14030 }
14031
Victor Stinnera47082312012-10-04 02:19:54 +020014032 buflen = arg->width;
14033 if (arg->sign && len == arg->width)
14034 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014035 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014036 return -1;
14037
14038 /* Write the sign if needed */
14039 if (arg->sign) {
14040 if (fill != ' ') {
14041 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14042 writer->pos += 1;
14043 }
14044 if (arg->width > len)
14045 arg->width--;
14046 }
14047
14048 /* Write the numeric prefix for "x", "X" and "o" formats
14049 if the alternate form is used.
14050 For example, write "0x" for the "%#x" format. */
14051 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14052 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14053 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14054 if (fill != ' ') {
14055 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14056 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14057 writer->pos += 2;
14058 pindex += 2;
14059 }
14060 arg->width -= 2;
14061 if (arg->width < 0)
14062 arg->width = 0;
14063 len -= 2;
14064 }
14065
14066 /* Pad left with the fill character if needed */
14067 if (arg->width > len && !(arg->flags & F_LJUST)) {
14068 sublen = arg->width - len;
14069 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14070 writer->pos += sublen;
14071 arg->width = len;
14072 }
14073
14074 /* If padding with spaces: write sign if needed and/or numeric prefix if
14075 the alternate form is used */
14076 if (fill == ' ') {
14077 if (arg->sign) {
14078 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14079 writer->pos += 1;
14080 }
14081 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14082 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14083 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14084 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14085 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14086 writer->pos += 2;
14087 pindex += 2;
14088 }
14089 }
14090
14091 /* Write characters */
14092 if (len) {
14093 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14094 str, pindex, len);
14095 writer->pos += len;
14096 }
14097
14098 /* Pad right with the fill character if needed */
14099 if (arg->width > len) {
14100 sublen = arg->width - len;
14101 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14102 writer->pos += sublen;
14103 }
14104 return 0;
14105}
14106
14107/* Helper of PyUnicode_Format(): format one arg.
14108 Return 0 on success, raise an exception and return -1 on error. */
14109static int
14110unicode_format_arg(struct unicode_formatter_t *ctx)
14111{
14112 struct unicode_format_arg_t arg;
14113 PyObject *str;
14114 int ret;
14115
Victor Stinner8dbd4212012-12-04 09:30:24 +010014116 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14117 arg.flags = 0;
14118 arg.width = -1;
14119 arg.prec = -1;
14120 arg.sign = 0;
14121 str = NULL;
14122
Victor Stinnera47082312012-10-04 02:19:54 +020014123 ret = unicode_format_arg_parse(ctx, &arg);
14124 if (ret == -1)
14125 return -1;
14126
14127 ret = unicode_format_arg_format(ctx, &arg, &str);
14128 if (ret == -1)
14129 return -1;
14130
14131 if (ret != 1) {
14132 ret = unicode_format_arg_output(ctx, &arg, str);
14133 Py_DECREF(str);
14134 if (ret == -1)
14135 return -1;
14136 }
14137
14138 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14139 PyErr_SetString(PyExc_TypeError,
14140 "not all arguments converted during string formatting");
14141 return -1;
14142 }
14143 return 0;
14144}
14145
Alexander Belopolsky40018472011-02-26 01:02:56 +000014146PyObject *
14147PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014148{
Victor Stinnera47082312012-10-04 02:19:54 +020014149 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014150
Guido van Rossumd57fd912000-03-10 22:53:23 +000014151 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014152 PyErr_BadInternalCall();
14153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014154 }
Victor Stinnera47082312012-10-04 02:19:54 +020014155
14156 ctx.fmtstr = PyUnicode_FromObject(format);
14157 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014158 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014159 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14160 Py_DECREF(ctx.fmtstr);
14161 return NULL;
14162 }
14163 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14164 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14165 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14166 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014167
Victor Stinner8f674cc2013-04-17 23:02:17 +020014168 _PyUnicodeWriter_Init(&ctx.writer);
14169 ctx.writer.min_length = ctx.fmtcnt + 100;
14170 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014171
Guido van Rossumd57fd912000-03-10 22:53:23 +000014172 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014173 ctx.arglen = PyTuple_Size(args);
14174 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014175 }
14176 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014177 ctx.arglen = -1;
14178 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014179 }
Victor Stinnera47082312012-10-04 02:19:54 +020014180 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014181 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014182 ctx.dict = args;
14183 else
14184 ctx.dict = NULL;
14185 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014186
Victor Stinnera47082312012-10-04 02:19:54 +020014187 while (--ctx.fmtcnt >= 0) {
14188 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014189 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014190
14191 nonfmtpos = ctx.fmtpos++;
14192 while (ctx.fmtcnt >= 0 &&
14193 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14194 ctx.fmtpos++;
14195 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014196 }
Victor Stinnera47082312012-10-04 02:19:54 +020014197 if (ctx.fmtcnt < 0) {
14198 ctx.fmtpos--;
14199 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014200 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014201
Victor Stinnercfc4c132013-04-03 01:48:39 +020014202 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14203 nonfmtpos, ctx.fmtpos) < 0)
14204 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014205 }
14206 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014207 ctx.fmtpos++;
14208 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014209 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014210 }
14211 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014212
Victor Stinnera47082312012-10-04 02:19:54 +020014213 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014214 PyErr_SetString(PyExc_TypeError,
14215 "not all arguments converted during string formatting");
14216 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014217 }
14218
Victor Stinnera47082312012-10-04 02:19:54 +020014219 if (ctx.args_owned) {
14220 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014221 }
Victor Stinnera47082312012-10-04 02:19:54 +020014222 Py_DECREF(ctx.fmtstr);
14223 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014224
Benjamin Peterson29060642009-01-31 22:14:21 +000014225 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014226 Py_DECREF(ctx.fmtstr);
14227 _PyUnicodeWriter_Dealloc(&ctx.writer);
14228 if (ctx.args_owned) {
14229 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014230 }
14231 return NULL;
14232}
14233
Jeremy Hylton938ace62002-07-17 16:30:39 +000014234static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014235unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14236
Tim Peters6d6c1a32001-08-02 04:15:00 +000014237static PyObject *
14238unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14239{
Benjamin Peterson29060642009-01-31 22:14:21 +000014240 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014241 static char *kwlist[] = {"object", "encoding", "errors", 0};
14242 char *encoding = NULL;
14243 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014244
Benjamin Peterson14339b62009-01-31 16:36:08 +000014245 if (type != &PyUnicode_Type)
14246 return unicode_subtype_new(type, args, kwds);
14247 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014248 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014249 return NULL;
14250 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014251 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 if (encoding == NULL && errors == NULL)
14253 return PyObject_Str(x);
14254 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014255 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014256}
14257
Guido van Rossume023fe02001-08-30 03:12:59 +000014258static PyObject *
14259unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14260{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014261 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014262 Py_ssize_t length, char_size;
14263 int share_wstr, share_utf8;
14264 unsigned int kind;
14265 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014266
Benjamin Peterson14339b62009-01-31 16:36:08 +000014267 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014268
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014269 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014270 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014272 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014273 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014274 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014275 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014276 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014277
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014278 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014279 if (self == NULL) {
14280 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014281 return NULL;
14282 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014283 kind = PyUnicode_KIND(unicode);
14284 length = PyUnicode_GET_LENGTH(unicode);
14285
14286 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014287#ifdef Py_DEBUG
14288 _PyUnicode_HASH(self) = -1;
14289#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014290 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014291#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014292 _PyUnicode_STATE(self).interned = 0;
14293 _PyUnicode_STATE(self).kind = kind;
14294 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014295 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014296 _PyUnicode_STATE(self).ready = 1;
14297 _PyUnicode_WSTR(self) = NULL;
14298 _PyUnicode_UTF8_LENGTH(self) = 0;
14299 _PyUnicode_UTF8(self) = NULL;
14300 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014301 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014302
14303 share_utf8 = 0;
14304 share_wstr = 0;
14305 if (kind == PyUnicode_1BYTE_KIND) {
14306 char_size = 1;
14307 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14308 share_utf8 = 1;
14309 }
14310 else if (kind == PyUnicode_2BYTE_KIND) {
14311 char_size = 2;
14312 if (sizeof(wchar_t) == 2)
14313 share_wstr = 1;
14314 }
14315 else {
14316 assert(kind == PyUnicode_4BYTE_KIND);
14317 char_size = 4;
14318 if (sizeof(wchar_t) == 4)
14319 share_wstr = 1;
14320 }
14321
14322 /* Ensure we won't overflow the length. */
14323 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14324 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014325 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014327 data = PyObject_MALLOC((length + 1) * char_size);
14328 if (data == NULL) {
14329 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014330 goto onError;
14331 }
14332
Victor Stinnerc3c74152011-10-02 20:39:55 +020014333 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014334 if (share_utf8) {
14335 _PyUnicode_UTF8_LENGTH(self) = length;
14336 _PyUnicode_UTF8(self) = data;
14337 }
14338 if (share_wstr) {
14339 _PyUnicode_WSTR_LENGTH(self) = length;
14340 _PyUnicode_WSTR(self) = (wchar_t *)data;
14341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014342
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014343 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014344 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014345 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014346#ifdef Py_DEBUG
14347 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14348#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014349 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014350 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014351
14352onError:
14353 Py_DECREF(unicode);
14354 Py_DECREF(self);
14355 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014356}
14357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014358PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014359"str(object='') -> str\n\
14360str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014361\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014362Create a new string object from the given object. If encoding or\n\
14363errors is specified, then the object must expose a data buffer\n\
14364that will be decoded using the given encoding and error handler.\n\
14365Otherwise, returns the result of object.__str__() (if defined)\n\
14366or repr(object).\n\
14367encoding defaults to sys.getdefaultencoding().\n\
14368errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014369
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014370static PyObject *unicode_iter(PyObject *seq);
14371
Guido van Rossumd57fd912000-03-10 22:53:23 +000014372PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014373 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 "str", /* tp_name */
14375 sizeof(PyUnicodeObject), /* tp_size */
14376 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014377 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014378 (destructor)unicode_dealloc, /* tp_dealloc */
14379 0, /* tp_print */
14380 0, /* tp_getattr */
14381 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014382 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014383 unicode_repr, /* tp_repr */
14384 &unicode_as_number, /* tp_as_number */
14385 &unicode_as_sequence, /* tp_as_sequence */
14386 &unicode_as_mapping, /* tp_as_mapping */
14387 (hashfunc) unicode_hash, /* tp_hash*/
14388 0, /* tp_call*/
14389 (reprfunc) unicode_str, /* tp_str */
14390 PyObject_GenericGetAttr, /* tp_getattro */
14391 0, /* tp_setattro */
14392 0, /* tp_as_buffer */
14393 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014394 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014395 unicode_doc, /* tp_doc */
14396 0, /* tp_traverse */
14397 0, /* tp_clear */
14398 PyUnicode_RichCompare, /* tp_richcompare */
14399 0, /* tp_weaklistoffset */
14400 unicode_iter, /* tp_iter */
14401 0, /* tp_iternext */
14402 unicode_methods, /* tp_methods */
14403 0, /* tp_members */
14404 0, /* tp_getset */
14405 &PyBaseObject_Type, /* tp_base */
14406 0, /* tp_dict */
14407 0, /* tp_descr_get */
14408 0, /* tp_descr_set */
14409 0, /* tp_dictoffset */
14410 0, /* tp_init */
14411 0, /* tp_alloc */
14412 unicode_new, /* tp_new */
14413 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014414};
14415
14416/* Initialize the Unicode implementation */
14417
Victor Stinner3a50e702011-10-18 21:21:00 +020014418int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014419{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014420 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014421 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014422 0x000A, /* LINE FEED */
14423 0x000D, /* CARRIAGE RETURN */
14424 0x001C, /* FILE SEPARATOR */
14425 0x001D, /* GROUP SEPARATOR */
14426 0x001E, /* RECORD SEPARATOR */
14427 0x0085, /* NEXT LINE */
14428 0x2028, /* LINE SEPARATOR */
14429 0x2029, /* PARAGRAPH SEPARATOR */
14430 };
14431
Fred Drakee4315f52000-05-09 19:53:39 +000014432 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014433 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014434 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014435 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014436 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014437
Guido van Rossumcacfc072002-05-24 19:01:59 +000014438 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014439 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014440
14441 /* initialize the linebreak bloom filter */
14442 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014443 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014444 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014445
14446 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014447
Benjamin Petersonc4311282012-10-30 23:21:10 -040014448 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14449 Py_FatalError("Can't initialize field name iterator type");
14450
14451 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14452 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014453
Victor Stinner3a50e702011-10-18 21:21:00 +020014454#ifdef HAVE_MBCS
14455 winver.dwOSVersionInfoSize = sizeof(winver);
14456 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14457 PyErr_SetFromWindowsErr(0);
14458 return -1;
14459 }
14460#endif
14461 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014462}
14463
/* Clear the free list of the Unicode implementation.

   This function releases nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14471
Guido van Rossumd57fd912000-03-10 22:53:23 +000014472void
Thomas Wouters78890102000-07-22 19:25:51 +000014473_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014474{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014475 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014476
Serhiy Storchaka05997252013-01-26 12:14:02 +020014477 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014478
Serhiy Storchaka05997252013-01-26 12:14:02 +020014479 for (i = 0; i < 256; i++)
14480 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014481 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014482 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014483}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014484
Walter Dörwald16807132007-05-25 13:52:07 +000014485void
14486PyUnicode_InternInPlace(PyObject **p)
14487{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014488 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014489 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014490#ifdef Py_DEBUG
14491 assert(s != NULL);
14492 assert(_PyUnicode_CHECK(s));
14493#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014494 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014495 return;
14496#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014497 /* If it's a subclass, we don't really know what putting
14498 it in the interned dict might do. */
14499 if (!PyUnicode_CheckExact(s))
14500 return;
14501 if (PyUnicode_CHECK_INTERNED(s))
14502 return;
14503 if (interned == NULL) {
14504 interned = PyDict_New();
14505 if (interned == NULL) {
14506 PyErr_Clear(); /* Don't leave an exception */
14507 return;
14508 }
14509 }
14510 /* It might be that the GetItem call fails even
14511 though the key is present in the dictionary,
14512 namely when this happens during a stack overflow. */
14513 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014514 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014515 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014516
Victor Stinnerf0335102013-04-14 19:13:03 +020014517 if (t) {
14518 Py_INCREF(t);
14519 Py_DECREF(*p);
14520 *p = t;
14521 return;
14522 }
Walter Dörwald16807132007-05-25 13:52:07 +000014523
Benjamin Peterson14339b62009-01-31 16:36:08 +000014524 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014525 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014526 PyErr_Clear();
14527 PyThreadState_GET()->recursion_critical = 0;
14528 return;
14529 }
14530 PyThreadState_GET()->recursion_critical = 0;
14531 /* The two references in interned are not counted by refcnt.
14532 The deallocator will take care of this */
14533 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014534 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014535}
14536
14537void
14538PyUnicode_InternImmortal(PyObject **p)
14539{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014540 PyUnicode_InternInPlace(p);
14541 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014542 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014543 Py_INCREF(*p);
14544 }
Walter Dörwald16807132007-05-25 13:52:07 +000014545}
14546
14547PyObject *
14548PyUnicode_InternFromString(const char *cp)
14549{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014550 PyObject *s = PyUnicode_FromString(cp);
14551 if (s == NULL)
14552 return NULL;
14553 PyUnicode_InternInPlace(&s);
14554 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014555}
14556
Alexander Belopolsky40018472011-02-26 01:02:56 +000014557void
14558_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014559{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014560 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014561 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014562 Py_ssize_t i, n;
14563 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014564
Benjamin Peterson14339b62009-01-31 16:36:08 +000014565 if (interned == NULL || !PyDict_Check(interned))
14566 return;
14567 keys = PyDict_Keys(interned);
14568 if (keys == NULL || !PyList_Check(keys)) {
14569 PyErr_Clear();
14570 return;
14571 }
Walter Dörwald16807132007-05-25 13:52:07 +000014572
Benjamin Peterson14339b62009-01-31 16:36:08 +000014573 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14574 detector, interned unicode strings are not forcibly deallocated;
14575 rather, we give them their stolen references back, and then clear
14576 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014577
Benjamin Peterson14339b62009-01-31 16:36:08 +000014578 n = PyList_GET_SIZE(keys);
14579 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014580 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014581 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014582 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014583 if (PyUnicode_READY(s) == -1) {
14584 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014585 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014587 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014588 case SSTATE_NOT_INTERNED:
14589 /* XXX Shouldn't happen */
14590 break;
14591 case SSTATE_INTERNED_IMMORTAL:
14592 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014593 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014594 break;
14595 case SSTATE_INTERNED_MORTAL:
14596 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014597 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014598 break;
14599 default:
14600 Py_FatalError("Inconsistent interned string state.");
14601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014602 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014603 }
14604 fprintf(stderr, "total size of all interned strings: "
14605 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14606 "mortal/immortal\n", mortal_size, immortal_size);
14607 Py_DECREF(keys);
14608 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014609 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014610}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014611
14612
14613/********************* Unicode Iterator **************************/
14614
14615typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014616 PyObject_HEAD
14617 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014618 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014619} unicodeiterobject;
14620
14621static void
14622unicodeiter_dealloc(unicodeiterobject *it)
14623{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014624 _PyObject_GC_UNTRACK(it);
14625 Py_XDECREF(it->it_seq);
14626 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014627}
14628
14629static int
14630unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14631{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014632 Py_VISIT(it->it_seq);
14633 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014634}
14635
14636static PyObject *
14637unicodeiter_next(unicodeiterobject *it)
14638{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014639 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014640
Benjamin Peterson14339b62009-01-31 16:36:08 +000014641 assert(it != NULL);
14642 seq = it->it_seq;
14643 if (seq == NULL)
14644 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014645 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014647 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14648 int kind = PyUnicode_KIND(seq);
14649 void *data = PyUnicode_DATA(seq);
14650 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14651 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014652 if (item != NULL)
14653 ++it->it_index;
14654 return item;
14655 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014656
Benjamin Peterson14339b62009-01-31 16:36:08 +000014657 Py_DECREF(seq);
14658 it->it_seq = NULL;
14659 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014660}
14661
14662static PyObject *
14663unicodeiter_len(unicodeiterobject *it)
14664{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014665 Py_ssize_t len = 0;
14666 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014667 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014668 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014669}
14670
14671PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14672
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014673static PyObject *
14674unicodeiter_reduce(unicodeiterobject *it)
14675{
14676 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014677 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014678 it->it_seq, it->it_index);
14679 } else {
14680 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14681 if (u == NULL)
14682 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014683 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014684 }
14685}
14686
14687PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14688
14689static PyObject *
14690unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14691{
14692 Py_ssize_t index = PyLong_AsSsize_t(state);
14693 if (index == -1 && PyErr_Occurred())
14694 return NULL;
14695 if (index < 0)
14696 index = 0;
14697 it->it_index = index;
14698 Py_RETURN_NONE;
14699}
14700
14701PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14702
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014703static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014704 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014705 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014706 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14707 reduce_doc},
14708 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14709 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014710 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014711};
14712
14713PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014714 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14715 "str_iterator", /* tp_name */
14716 sizeof(unicodeiterobject), /* tp_basicsize */
14717 0, /* tp_itemsize */
14718 /* methods */
14719 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14720 0, /* tp_print */
14721 0, /* tp_getattr */
14722 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014723 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014724 0, /* tp_repr */
14725 0, /* tp_as_number */
14726 0, /* tp_as_sequence */
14727 0, /* tp_as_mapping */
14728 0, /* tp_hash */
14729 0, /* tp_call */
14730 0, /* tp_str */
14731 PyObject_GenericGetAttr, /* tp_getattro */
14732 0, /* tp_setattro */
14733 0, /* tp_as_buffer */
14734 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14735 0, /* tp_doc */
14736 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14737 0, /* tp_clear */
14738 0, /* tp_richcompare */
14739 0, /* tp_weaklistoffset */
14740 PyObject_SelfIter, /* tp_iter */
14741 (iternextfunc)unicodeiter_next, /* tp_iternext */
14742 unicodeiter_methods, /* tp_methods */
14743 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014744};
14745
14746static PyObject *
14747unicode_iter(PyObject *seq)
14748{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014749 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014750
Benjamin Peterson14339b62009-01-31 16:36:08 +000014751 if (!PyUnicode_Check(seq)) {
14752 PyErr_BadInternalCall();
14753 return NULL;
14754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014755 if (PyUnicode_READY(seq) == -1)
14756 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014757 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14758 if (it == NULL)
14759 return NULL;
14760 it->it_index = 0;
14761 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014762 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014763 _PyObject_GC_TRACK(it);
14764 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014765}
14766
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014767
/* Return the number of Py_UNICODE elements in u before the terminating
   null element.

   The length is computed as a pointer difference rather than with an
   int counter, so strings longer than INT_MAX elements no longer
   overflow a signed int before conversion to size_t. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end)
        end++;
    return (size_t)(end - u);
}
14776
/* Copy the null-terminated string s2, terminator included, into s1 and
   return s1.  The caller must provide enough room in s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14784
/* Copy at most n elements of the null-terminated string s2 into s1 and
   return s1.

   Copying stops right after the terminator has been written, or once n
   elements have been written, whichever comes first.  Consequently:
     - nothing is written when n is 0;
     - s1 is NOT null-terminated when s2 holds n or more characters;
     - unlike C strncpy(), the remainder of s1 is not zero-padded.

   The previous loop tested n only after each store and therefore wrote
   up to n + 1 elements (one element even for n == 0), overrunning a
   destination sized for exactly n elements. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE ch = s2[i];
        s1[i] = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14794
/* Append the null-terminated string s2 to the end of s1 and return s1.
   The caller must provide enough room in s1 for the combined string. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
14803
/* Compare two null-terminated strings element by element.
   Returns -1, 0 or +1 when s1 sorts before, equal to or after s2; a
   string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE a, b;

    /* Advance to the first position where the strings differ or where
       at least one of them ends. */
    for (;;) {
        a = *s1;
        b = *s2;
        if (a == 0 || b == 0 || a != b)
            break;
        s1++;
        s2++;
    }

    if (a != 0 && b != 0)
        return (a < b) ? -1 : +1;
    if (a != 0)
        return 1;       /* s2 ended first */
    if (b != 0)
        return -1;      /* s1 ended first */
    return 0;
}
14817
/* Compare at most n elements of two null-terminated strings.
   Returns -1 or +1 at the first differing element, and 0 when the
   strings agree up to a shared terminator or over the first n
   elements. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n != 0) {
        const Py_UNICODE a = *s1++;
        const Py_UNICODE b = *s2++;

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            return 0;
        n--;
    }
    return 0;
}
14834
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14844
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass that remembers the most recent match. */
    for (; *s != 0; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014857
Victor Stinner71133ff2010-09-01 23:43:53 +000014858Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014859PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014860{
Victor Stinner577db2c2011-10-11 22:12:48 +020014861 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014862 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014864 if (!PyUnicode_Check(unicode)) {
14865 PyErr_BadArgument();
14866 return NULL;
14867 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014868 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014869 if (u == NULL)
14870 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014871 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014872 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014873 PyErr_NoMemory();
14874 return NULL;
14875 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014876 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014877 size *= sizeof(Py_UNICODE);
14878 copy = PyMem_Malloc(size);
14879 if (copy == NULL) {
14880 PyErr_NoMemory();
14881 return NULL;
14882 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014883 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014884 return copy;
14885}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014886
Georg Brandl66c221e2010-10-14 07:04:07 +000014887/* A _string module, to export formatter_parser and formatter_field_name_split
14888 to the string.Formatter class implemented in Python. */
14889
14890static PyMethodDef _string_methods[] = {
14891 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14892 METH_O, PyDoc_STR("split the argument as a field name")},
14893 {"formatter_parser", (PyCFunction) formatter_parser,
14894 METH_O, PyDoc_STR("parse the argument as a format string")},
14895 {NULL, NULL}
14896};
14897
14898static struct PyModuleDef _string_module = {
14899 PyModuleDef_HEAD_INIT,
14900 "_string",
14901 PyDoc_STR("string helper module"),
14902 0,
14903 _string_methods,
14904 NULL,
14905 NULL,
14906 NULL,
14907 NULL
14908};
14909
14910PyMODINIT_FUNC
14911PyInit__string(void)
14912{
14913 return PyModule_Create(&_string_module);
14914}
14915
14916
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014917#ifdef __cplusplus
14918}
14919#endif