blob: cd2acc0f83547abdae7dac25c8c584edf9fb5ea8 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
Serhiy Storchaka05997252013-01-26 12:14:02 +020052NOTE: In the interpreter's initialization phase, some globals are currently
53 initialized dynamically as needed. In the process Unicode objects may
54 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055
56*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
Victor Stinner8faf8212011-12-08 22:14:11 +010063/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
64#define MAX_UNICODE 0x10ffff
65
/* _PyUnicode_CHECK(op): sanity check used inside the assert()s of the macros
   in this file.  In Py_DEBUG builds it runs _PyUnicode_CheckConsistency()
   with check_content == 0; otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* Raw field accessors for the Unicode object structs.
   The names starting with an underscore cast `op` and read the field without
   any check; the PyUnicode_* variants assert that `op` is a consistent, ready
   Unicode object first. */

/* Raw utf8 pointer stored in the PyCompactUnicodeObject part. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 data of a ready string: for compact ASCII strings this is the memory
   located right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field stored in the PyCompactUnicodeObject part. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for compact ASCII strings this is the
   string length itself, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr pointer (PyASCIIObject part). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw wstr_length field (PyCompactUnicodeObject part). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw length, state and hash fields (PyASCIIObject part), usable as
   lvalues. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked access to state.kind; unlike the raw accessors above this asserts
   _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Checked access to the length field. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw data.any pointer of a PyUnicodeObject (no check is performed). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().

   The result is the bitwise OR of both arguments: it is always >= each of
   them, but it is not necessarily equal to the larger one.
   NOTE(review): this presumably relies on PyUnicode_New() only caring about
   the range its maxchar argument falls in -- confirm before using the macro
   for anything else. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
111
/* File-local replacement for the public PyUnicode_READY() macro.
   Asserts _PyUnicode_CHECK(op), then evaluates to 0 when the string is
   already ready and to the result of _PyUnicode_Ready(op) otherwise.
   Note that `op` is expanded more than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of `op` is the character data itself (the UTF-8
   representation is shared with the data instead of being a separate block).
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the character data itself (the wchar_t
   representation is shared with the data instead of being a separate block).
   The macro only refers to its own argument, so it works whatever the
   caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): its wstr pointer is set and either the
   string is not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
142
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   `to` points to the buffer where the result characters are written and is
   converted to "to_type *".

   Every argument is evaluated exactly once, and `to` is parenthesized before
   the cast so that an expression such as `buf + offset` is converted as a
   whole.  The copy runs four characters at a time over the largest multiple
   of four, then finishes the remaining 0-3 characters one by one.  Each
   character is converted with a plain cast; no range check is done. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = _end - _iter;                   \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < _unrolled_end) {                 \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < _end)                            \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200166
Walter Dörwald16807132007-05-25 13:52:07 +0000167/* This dictionary holds all interned unicode strings. Note that references
168 to strings in this dictionary are *not* counted in the string's ob_refcnt.
169 When the interned string reaches a refcnt of 0 the string deallocation
170 function will delete the reference from this dictionary.
171
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000174*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200175static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000177/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails, unicode_empty
   stays NULL and no reference is taken, so callers must check unicode_empty
   for NULL afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL if the empty string could not be
   created (see _Py_INCREF_UNICODE_EMPTY). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200199/* Forward declaration */
200Py_LOCAL_INLINE(int)
201_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
202
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200205
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206/* Single character Unicode strings in the Latin-1 range are being
207 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200208static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by code point for the ASCII range (0x00-0x7F):
   an entry is 1 if the character is treated as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
240
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200243static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100244static int unicode_modifiable(PyObject *unicode);
245
Victor Stinnerfe226c02011-10-03 03:52:20 +0200246
Alexander Belopolsky40018472011-02-26 01:02:56 +0000247static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100248_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200249static PyObject *
250_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
251static PyObject *
252_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
253
254static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000255unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100257 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000258 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static void
261raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300262 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100263 PyObject *unicode,
264 Py_ssize_t startpos, Py_ssize_t endpos,
265 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000266
/* Same for linebreaks: lookup table indexed by code point for the ASCII
   range (0x00-0x7F); an entry is 1 if the character is a line break. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-only check of the internal invariants of a Unicode object.

   op:            object to check; must be a Unicode object.
   check_content: if non-zero (and the string is not in the legacy wchar
                  state), also scan the characters to verify that the
                  narrowest possible kind is used and that the data is
                  NUL-terminated.

   Every violated invariant fails an assert().  The function always returns 1
   so that it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string: always 1 byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII string: the character data is located
               right after the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            /* non-ASCII data can never double as the UTF-8 buffer */
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact string: the data pointer lives in the
               PyUnicodeObject part */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is set, everything
                   else must still be in its initial state */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data is shared with the UTF-8 buffer */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind is the one
               selected by SIZEOF_WCHAR_T (2-byte kind if it is 2, the
               4-byte kind otherwise) */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing buffer must have a zero cached length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* find the largest code point actually stored in the string */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest code point must fall in the range that requires
           exactly this kind (and this ascii flag) */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character following the last one must be a NUL terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Victor Stinner3a50e702011-10-18 21:21:00 +0200524#ifdef HAVE_MBCS
525static OSVERSIONINFOEX winver;
526#endif
527
Thomas Wouters477c8d52006-05-27 19:21:47 +0000528/* --- Bloom Filters ----------------------------------------------------- */
529
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character (5 bits when
   BLOOM_WIDTH is 32) as the bit index. */
533
534/* the linebreak mask is set up by Unicode_Init below */
535
/* Number of bits used in a bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT.  Compilation fails if LONG_BIT is below 32. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters.  It starts with every bit set,
   so until it is replaced by the real mask the filter lets every character
   through to the exact test. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
/* Test whether the bit selected by the low bits of `ch` is set in `mask`.
   Evaluates to a non-zero value if the bit is set (the character may belong
   to the set the mask was built from) and to zero if it is not (the
   character is definitely not in the set).  Both arguments are
   parenthesized, so arbitrary expressions may be passed. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
/* True if `ch` is a line break character.  Characters below 128 are looked
   up in the ascii_linebreak table; for the others the bloom_linebreak mask
   is tested first and Py_UNICODE_ISLINEBREAK() is only called when the mask
   does not rule the character out.  `ch` is expanded several times, so it
   must not have side effects. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Alexander Belopolsky40018472011-02-26 01:02:56 +0000556Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558{
Victor Stinnera85af502013-04-09 21:53:54 +0200559#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
560 do { \
561 TYPE *data = (TYPE *)PTR; \
562 TYPE *end = data + LEN; \
563 Py_UCS4 ch; \
564 for (; data != end; data++) { \
565 ch = *data; \
566 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
567 } \
568 break; \
569 } while (0)
570
Thomas Wouters477c8d52006-05-27 19:21:47 +0000571 /* calculate simple bloom-style bitmask for a given unicode string */
572
Antoine Pitrouf068f942010-01-13 14:19:12 +0000573 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000574
575 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200576 switch (kind) {
577 case PyUnicode_1BYTE_KIND:
578 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
579 break;
580 case PyUnicode_2BYTE_KIND:
581 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
582 break;
583 case PyUnicode_4BYTE_KIND:
584 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
585 break;
586 default:
587 assert(0);
588 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000589 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200590
591#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000592}
593
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200594/* Compilation of templated routines */
595
596#include "stringlib/asciilib.h"
597#include "stringlib/fastsearch.h"
598#include "stringlib/partition.h"
599#include "stringlib/split.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
602#include "stringlib/find_max_char.h"
603#include "stringlib/localeutil.h"
604#include "stringlib/undef.h"
605
606#include "stringlib/ucs1lib.h"
607#include "stringlib/fastsearch.h"
608#include "stringlib/partition.h"
609#include "stringlib/split.h"
610#include "stringlib/count.h"
611#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300612#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200613#include "stringlib/find_max_char.h"
614#include "stringlib/localeutil.h"
615#include "stringlib/undef.h"
616
617#include "stringlib/ucs2lib.h"
618#include "stringlib/fastsearch.h"
619#include "stringlib/partition.h"
620#include "stringlib/split.h"
621#include "stringlib/count.h"
622#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300623#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200624#include "stringlib/find_max_char.h"
625#include "stringlib/localeutil.h"
626#include "stringlib/undef.h"
627
628#include "stringlib/ucs4lib.h"
629#include "stringlib/fastsearch.h"
630#include "stringlib/partition.h"
631#include "stringlib/split.h"
632#include "stringlib/count.h"
633#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300634#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200635#include "stringlib/find_max_char.h"
636#include "stringlib/localeutil.h"
637#include "stringlib/undef.h"
638
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200639#include "stringlib/unicodedefs.h"
640#include "stringlib/fastsearch.h"
641#include "stringlib/count.h"
642#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100643#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200644
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645/* --- Unicode Object ----------------------------------------------------- */
646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200648fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
651 Py_ssize_t size, Py_UCS4 ch,
652 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200653{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200654 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
655
656 switch (kind) {
657 case PyUnicode_1BYTE_KIND:
658 {
659 Py_UCS1 ch1 = (Py_UCS1) ch;
660 if (ch1 == ch)
661 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
662 else
663 return -1;
664 }
665 case PyUnicode_2BYTE_KIND:
666 {
667 Py_UCS2 ch2 = (Py_UCS2) ch;
668 if (ch2 == ch)
669 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
670 else
671 return -1;
672 }
673 case PyUnicode_4BYTE_KIND:
674 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
675 default:
676 assert(0);
677 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679}
680
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length of the
   string are overwritten (every byte is set to 0xff); nothing is done when
   the string did not grow.  The kind value is used as the number of bytes
   per character.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings.  U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
699
Victor Stinnerfe226c02011-10-03 03:52:20 +0200700static PyObject*
701resize_compact(PyObject *unicode, Py_ssize_t length)
702{
703 Py_ssize_t char_size;
704 Py_ssize_t struct_size;
705 Py_ssize_t new_size;
706 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100707 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200708#ifdef Py_DEBUG
709 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
710#endif
711
Victor Stinner79891572012-05-03 13:43:07 +0200712 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100714 assert(PyUnicode_IS_COMPACT(unicode));
715
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200716 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100717 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 struct_size = sizeof(PyASCIIObject);
719 else
720 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200721 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
724 PyErr_NoMemory();
725 return NULL;
726 }
727 new_size = (struct_size + (length + 1) * char_size);
728
Victor Stinner84def372011-12-11 20:04:56 +0100729 _Py_DEC_REFTOTAL;
730 _Py_ForgetReference(unicode);
731
732 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
733 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100734 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 PyErr_NoMemory();
736 return NULL;
737 }
Victor Stinner84def372011-12-11 20:04:56 +0100738 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100740
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200742 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100744 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200745 _PyUnicode_WSTR_LENGTH(unicode) = length;
746 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100747 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
748 PyObject_DEL(_PyUnicode_WSTR(unicode));
749 _PyUnicode_WSTR(unicode) = NULL;
750 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200751#ifdef Py_DEBUG
752 unicode_fill_invalid(unicode, old_length);
753#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200754 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
755 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200757 return unicode;
758}
759
Alexander Belopolsky40018472011-02-26 01:02:56 +0000760static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200761resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762{
Victor Stinner95663112011-10-04 01:03:50 +0200763 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200765 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000767
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768 if (PyUnicode_IS_READY(unicode)) {
769 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200770 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200772#ifdef Py_DEBUG
773 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
774#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775
776 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200777 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200778 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
779 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780
781 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
782 PyErr_NoMemory();
783 return -1;
784 }
785 new_size = (length + 1) * char_size;
786
Victor Stinner7a9105a2011-12-12 00:13:42 +0100787 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
788 {
789 PyObject_DEL(_PyUnicode_UTF8(unicode));
790 _PyUnicode_UTF8(unicode) = NULL;
791 _PyUnicode_UTF8_LENGTH(unicode) = 0;
792 }
793
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 data = (PyObject *)PyObject_REALLOC(data, new_size);
795 if (data == NULL) {
796 PyErr_NoMemory();
797 return -1;
798 }
799 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200801 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200802 _PyUnicode_WSTR_LENGTH(unicode) = length;
803 }
804 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200805 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200806 _PyUnicode_UTF8_LENGTH(unicode) = length;
807 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808 _PyUnicode_LENGTH(unicode) = length;
809 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200810#ifdef Py_DEBUG
811 unicode_fill_invalid(unicode, old_length);
812#endif
Victor Stinner95663112011-10-04 01:03:50 +0200813 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200814 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200816 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200817 }
Victor Stinner95663112011-10-04 01:03:50 +0200818 assert(_PyUnicode_WSTR(unicode) != NULL);
819
820 /* check for integer overflow */
821 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
822 PyErr_NoMemory();
823 return -1;
824 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200826 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100827 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200828 if (!wstr) {
829 PyErr_NoMemory();
830 return -1;
831 }
832 _PyUnicode_WSTR(unicode) = wstr;
833 _PyUnicode_WSTR(unicode)[length] = 0;
834 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200835 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 return 0;
837}
838
Victor Stinnerfe226c02011-10-03 03:52:20 +0200839static PyObject*
840resize_copy(PyObject *unicode, Py_ssize_t length)
841{
842 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200844 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845
Benjamin Petersonbac79492012-01-14 13:34:47 -0500846 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100847 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200848
849 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
850 if (copy == NULL)
851 return NULL;
852
853 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200854 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200855 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200856 }
857 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100859
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200860 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200861 if (w == NULL)
862 return NULL;
863 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
864 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200865 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
866 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200867 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200868 }
869}
870
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000872 Ux0000 terminated; some code (e.g. new_identifier)
873 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874
875 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000876 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877
878*/
879
Alexander Belopolsky40018472011-02-26 01:02:56 +0000880static PyUnicodeObject *
881_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882{
883 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885
Thomas Wouters477c8d52006-05-27 19:21:47 +0000886 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000887 if (length == 0 && unicode_empty != NULL) {
888 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200889 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 }
891
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000892 /* Ensure we won't overflow the size. */
893 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
894 return (PyUnicodeObject *)PyErr_NoMemory();
895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896 if (length < 0) {
897 PyErr_SetString(PyExc_SystemError,
898 "Negative size passed to _PyUnicode_New");
899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900 }
901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
903 if (unicode == NULL)
904 return NULL;
905 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
906 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
907 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100908 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000909 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100910 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912
Jeremy Hyltond8082792003-09-16 19:41:39 +0000913 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000914 * the caller fails before initializing str -- unicode_resize()
915 * reads str[0], and the Keep-Alive optimization can keep memory
916 * allocated for str alive across a call to unicode_dealloc(unicode).
917 * We don't want unicode_resize to read uninitialized memory in
918 * that case.
919 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200920 _PyUnicode_WSTR(unicode)[0] = 0;
921 _PyUnicode_WSTR(unicode)[length] = 0;
922 _PyUnicode_WSTR_LENGTH(unicode) = length;
923 _PyUnicode_HASH(unicode) = -1;
924 _PyUnicode_STATE(unicode).interned = 0;
925 _PyUnicode_STATE(unicode).kind = 0;
926 _PyUnicode_STATE(unicode).compact = 0;
927 _PyUnicode_STATE(unicode).ready = 0;
928 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200929 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200930 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200931 _PyUnicode_UTF8(unicode) = NULL;
932 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100933 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 return unicode;
935}
936
Victor Stinnerf42dc442011-10-02 23:33:16 +0200937static const char*
938unicode_kind_name(PyObject *unicode)
939{
Victor Stinner42dfd712011-10-03 14:41:45 +0200940 /* don't check consistency: unicode_kind_name() is called from
941 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200942 if (!PyUnicode_IS_COMPACT(unicode))
943 {
944 if (!PyUnicode_IS_READY(unicode))
945 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600946 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200947 {
948 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200949 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200950 return "legacy ascii";
951 else
952 return "legacy latin1";
953 case PyUnicode_2BYTE_KIND:
954 return "legacy UCS2";
955 case PyUnicode_4BYTE_KIND:
956 return "legacy UCS4";
957 default:
958 return "<legacy invalid kind>";
959 }
960 }
961 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600962 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200963 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200965 return "ascii";
966 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200967 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200968 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200969 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200971 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200972 default:
973 return "<invalid compact kind>";
974 }
975}
976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
982
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
986void *_PyUnicode_data(void *unicode){
987 printf("obj %p\n", unicode);
988 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
989 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
990 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
991 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
992 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
993 return PyUnicode_DATA(unicode);
994}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200995
996void
997_PyUnicode_Dump(PyObject *op)
998{
999 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001000 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1001 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1002 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001003
Victor Stinnera849a4b2011-10-03 12:12:11 +02001004 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001005 {
1006 if (ascii->state.ascii)
1007 data = (ascii + 1);
1008 else
1009 data = (compact + 1);
1010 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001011 else
1012 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001013 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1014
Victor Stinnera849a4b2011-10-03 12:12:11 +02001015 if (ascii->wstr == data)
1016 printf("shared ");
1017 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001018
Victor Stinnera3b334d2011-10-03 13:53:37 +02001019 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001020 printf(" (%zu), ", compact->wstr_length);
1021 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1022 printf("shared ");
1023 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001024 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001025 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001026}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027#endif
1028
1029PyObject *
1030PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1031{
1032 PyObject *obj;
1033 PyCompactUnicodeObject *unicode;
1034 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001035 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001036 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 Py_ssize_t char_size;
1038 Py_ssize_t struct_size;
1039
1040 /* Optimization for empty strings */
1041 if (size == 0 && unicode_empty != NULL) {
1042 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001043 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 }
1045
Victor Stinner9e9d6892011-10-04 01:02:02 +02001046 is_ascii = 0;
1047 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 struct_size = sizeof(PyCompactUnicodeObject);
1049 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001050 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 char_size = 1;
1052 is_ascii = 1;
1053 struct_size = sizeof(PyASCIIObject);
1054 }
1055 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001056 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 char_size = 1;
1058 }
1059 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001060 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 char_size = 2;
1062 if (sizeof(wchar_t) == 2)
1063 is_sharing = 1;
1064 }
1065 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001066 if (maxchar > MAX_UNICODE) {
1067 PyErr_SetString(PyExc_SystemError,
1068 "invalid maximum character passed to PyUnicode_New");
1069 return NULL;
1070 }
Victor Stinner8f825062012-04-27 13:55:39 +02001071 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 char_size = 4;
1073 if (sizeof(wchar_t) == 4)
1074 is_sharing = 1;
1075 }
1076
1077 /* Ensure we won't overflow the size. */
1078 if (size < 0) {
1079 PyErr_SetString(PyExc_SystemError,
1080 "Negative size passed to PyUnicode_New");
1081 return NULL;
1082 }
1083 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1084 return PyErr_NoMemory();
1085
1086 /* Duplicated allocation code from _PyObject_New() instead of a call to
1087 * PyObject_New() so we are able to allocate space for the object and
1088 * it's data buffer.
1089 */
1090 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1091 if (obj == NULL)
1092 return PyErr_NoMemory();
1093 obj = PyObject_INIT(obj, &PyUnicode_Type);
1094 if (obj == NULL)
1095 return NULL;
1096
1097 unicode = (PyCompactUnicodeObject *)obj;
1098 if (is_ascii)
1099 data = ((PyASCIIObject*)obj) + 1;
1100 else
1101 data = unicode + 1;
1102 _PyUnicode_LENGTH(unicode) = size;
1103 _PyUnicode_HASH(unicode) = -1;
1104 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001105 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 _PyUnicode_STATE(unicode).compact = 1;
1107 _PyUnicode_STATE(unicode).ready = 1;
1108 _PyUnicode_STATE(unicode).ascii = is_ascii;
1109 if (is_ascii) {
1110 ((char*)data)[size] = 0;
1111 _PyUnicode_WSTR(unicode) = NULL;
1112 }
Victor Stinner8f825062012-04-27 13:55:39 +02001113 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 ((char*)data)[size] = 0;
1115 _PyUnicode_WSTR(unicode) = NULL;
1116 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001118 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 else {
1121 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001122 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001123 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001125 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 ((Py_UCS4*)data)[size] = 0;
1127 if (is_sharing) {
1128 _PyUnicode_WSTR_LENGTH(unicode) = size;
1129 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1130 }
1131 else {
1132 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1133 _PyUnicode_WSTR(unicode) = NULL;
1134 }
1135 }
Victor Stinner8f825062012-04-27 13:55:39 +02001136#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001137 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001138#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001139 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001140 return obj;
1141}
1142
1143#if SIZEOF_WCHAR_T == 2
1144/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1145 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001146 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
1148 This function assumes that unicode can hold one more code point than wstr
1149 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001150static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001152 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153{
1154 const wchar_t *iter;
1155 Py_UCS4 *ucs4_out;
1156
Victor Stinner910337b2011-10-03 03:20:16 +02001157 assert(unicode != NULL);
1158 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1160 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1161
1162 for (iter = begin; iter < end; ) {
1163 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1164 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001165 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1166 && (iter+1) < end
1167 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 {
Victor Stinner551ac952011-11-29 22:58:13 +01001169 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170 iter += 2;
1171 }
1172 else {
1173 *ucs4_out++ = *iter;
1174 iter++;
1175 }
1176 }
1177 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1178 _PyUnicode_GET_LENGTH(unicode)));
1179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180}
1181#endif
1182
Victor Stinnercd9950f2011-10-02 00:34:53 +02001183static int
Victor Stinner488fa492011-12-12 00:01:39 +01001184unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001185{
Victor Stinner488fa492011-12-12 00:01:39 +01001186 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001187 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001188 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001189 return -1;
1190 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001191 return 0;
1192}
1193
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001194static int
1195_copy_characters(PyObject *to, Py_ssize_t to_start,
1196 PyObject *from, Py_ssize_t from_start,
1197 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 unsigned int from_kind, to_kind;
1200 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201
Victor Stinneree4544c2012-05-09 22:24:08 +02001202 assert(0 <= how_many);
1203 assert(0 <= from_start);
1204 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001205 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001207 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208
Victor Stinnerd3f08822012-05-29 12:57:52 +02001209 assert(PyUnicode_Check(to));
1210 assert(PyUnicode_IS_READY(to));
1211 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1212
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001213 if (how_many == 0)
1214 return 0;
1215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001217 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220
Victor Stinnerf1852262012-06-16 16:38:26 +02001221#ifdef Py_DEBUG
1222 if (!check_maxchar
1223 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1224 {
1225 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1226 Py_UCS4 ch;
1227 Py_ssize_t i;
1228 for (i=0; i < how_many; i++) {
1229 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1230 assert(ch <= to_maxchar);
1231 }
1232 }
1233#endif
1234
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001236 if (check_maxchar
1237 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1238 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001239 /* Writing Latin-1 characters into an ASCII string requires to
1240 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001241 Py_UCS4 max_char;
1242 max_char = ucs1lib_find_max_char(from_data,
1243 (Py_UCS1*)from_data + how_many);
1244 if (max_char >= 128)
1245 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001246 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001247 Py_MEMCPY((char*)to_data + to_kind * to_start,
1248 (char*)from_data + from_kind * from_start,
1249 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001251 else if (from_kind == PyUnicode_1BYTE_KIND
1252 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001253 {
1254 _PyUnicode_CONVERT_BYTES(
1255 Py_UCS1, Py_UCS2,
1256 PyUnicode_1BYTE_DATA(from) + from_start,
1257 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1258 PyUnicode_2BYTE_DATA(to) + to_start
1259 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001260 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001261 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001262 && to_kind == PyUnicode_4BYTE_KIND)
1263 {
1264 _PyUnicode_CONVERT_BYTES(
1265 Py_UCS1, Py_UCS4,
1266 PyUnicode_1BYTE_DATA(from) + from_start,
1267 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1268 PyUnicode_4BYTE_DATA(to) + to_start
1269 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001270 }
1271 else if (from_kind == PyUnicode_2BYTE_KIND
1272 && to_kind == PyUnicode_4BYTE_KIND)
1273 {
1274 _PyUnicode_CONVERT_BYTES(
1275 Py_UCS2, Py_UCS4,
1276 PyUnicode_2BYTE_DATA(from) + from_start,
1277 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1278 PyUnicode_4BYTE_DATA(to) + to_start
1279 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001280 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001282 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1283
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001284 if (!check_maxchar) {
1285 if (from_kind == PyUnicode_2BYTE_KIND
1286 && to_kind == PyUnicode_1BYTE_KIND)
1287 {
1288 _PyUnicode_CONVERT_BYTES(
1289 Py_UCS2, Py_UCS1,
1290 PyUnicode_2BYTE_DATA(from) + from_start,
1291 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1292 PyUnicode_1BYTE_DATA(to) + to_start
1293 );
1294 }
1295 else if (from_kind == PyUnicode_4BYTE_KIND
1296 && to_kind == PyUnicode_1BYTE_KIND)
1297 {
1298 _PyUnicode_CONVERT_BYTES(
1299 Py_UCS4, Py_UCS1,
1300 PyUnicode_4BYTE_DATA(from) + from_start,
1301 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1302 PyUnicode_1BYTE_DATA(to) + to_start
1303 );
1304 }
1305 else if (from_kind == PyUnicode_4BYTE_KIND
1306 && to_kind == PyUnicode_2BYTE_KIND)
1307 {
1308 _PyUnicode_CONVERT_BYTES(
1309 Py_UCS4, Py_UCS2,
1310 PyUnicode_4BYTE_DATA(from) + from_start,
1311 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1312 PyUnicode_2BYTE_DATA(to) + to_start
1313 );
1314 }
1315 else {
1316 assert(0);
1317 return -1;
1318 }
1319 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001320 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001321 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001322 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001323 Py_ssize_t i;
1324
Victor Stinnera0702ab2011-09-29 14:14:38 +02001325 for (i=0; i < how_many; i++) {
1326 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001327 if (ch > to_maxchar)
1328 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001329 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1330 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001331 }
1332 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001333 return 0;
1334}
1335
Victor Stinnerd3f08822012-05-29 12:57:52 +02001336void
1337_PyUnicode_FastCopyCharacters(
1338 PyObject *to, Py_ssize_t to_start,
1339 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001340{
1341 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1342}
1343
1344Py_ssize_t
1345PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1346 PyObject *from, Py_ssize_t from_start,
1347 Py_ssize_t how_many)
1348{
1349 int err;
1350
1351 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1352 PyErr_BadInternalCall();
1353 return -1;
1354 }
1355
Benjamin Petersonbac79492012-01-14 13:34:47 -05001356 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001357 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001358 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001359 return -1;
1360
Victor Stinnerd3f08822012-05-29 12:57:52 +02001361 if (from_start < 0) {
1362 PyErr_SetString(PyExc_IndexError, "string index out of range");
1363 return -1;
1364 }
1365 if (to_start < 0) {
1366 PyErr_SetString(PyExc_IndexError, "string index out of range");
1367 return -1;
1368 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001369 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1370 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1371 PyErr_Format(PyExc_SystemError,
1372 "Cannot write %zi characters at %zi "
1373 "in a string of %zi characters",
1374 how_many, to_start, PyUnicode_GET_LENGTH(to));
1375 return -1;
1376 }
1377
1378 if (how_many == 0)
1379 return 0;
1380
Victor Stinner488fa492011-12-12 00:01:39 +01001381 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001382 return -1;
1383
1384 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1385 if (err) {
1386 PyErr_Format(PyExc_SystemError,
1387 "Cannot copy %s characters "
1388 "into a string of %s characters",
1389 unicode_kind_name(from),
1390 unicode_kind_name(to));
1391 return -1;
1392 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001393 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394}
1395
Victor Stinner17222162011-09-28 22:15:37 +02001396/* Find the maximum code point and count the number of surrogate pairs so a
1397 correct string length can be computed before converting a string to UCS4.
1398 This function counts single surrogates as a character and not as a pair.
1399
1400 Return 0 on success, or -1 on error. */
1401static int
1402find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1403 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
1405 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001406 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinnerc53be962011-10-02 21:33:54 +02001408 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 *num_surrogates = 0;
1410 *maxchar = 0;
1411
1412 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001414 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1415 && (iter+1) < end
1416 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1417 {
1418 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1419 ++(*num_surrogates);
1420 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 }
1422 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001424 {
1425 ch = *iter;
1426 iter++;
1427 }
1428 if (ch > *maxchar) {
1429 *maxchar = ch;
1430 if (*maxchar > MAX_UNICODE) {
1431 PyErr_Format(PyExc_ValueError,
1432 "character U+%x is not in range [U+0000; U+10ffff]",
1433 ch);
1434 return -1;
1435 }
1436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 return 0;
1439}
1440
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001441int
1442_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443{
1444 wchar_t *end;
1445 Py_UCS4 maxchar = 0;
1446 Py_ssize_t num_surrogates;
1447#if SIZEOF_WCHAR_T == 2
1448 Py_ssize_t length_wo_surrogates;
1449#endif
1450
Georg Brandl7597add2011-10-05 16:36:47 +02001451 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001452 strings were created using _PyObject_New() and where no canonical
1453 representation (the str field) has been set yet aka strings
1454 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001455 assert(_PyUnicode_CHECK(unicode));
1456 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001459 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001460 /* Actually, it should neither be interned nor be anything else: */
1461 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001464 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001465 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467
1468 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001469 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1470 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 PyErr_NoMemory();
1472 return -1;
1473 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001474 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 _PyUnicode_WSTR(unicode), end,
1476 PyUnicode_1BYTE_DATA(unicode));
1477 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1478 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1479 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1480 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001481 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001482 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001483 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 }
1485 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001486 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001487 _PyUnicode_UTF8(unicode) = NULL;
1488 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 }
1490 PyObject_FREE(_PyUnicode_WSTR(unicode));
1491 _PyUnicode_WSTR(unicode) = NULL;
1492 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1493 }
1494 /* In this case we might have to convert down from 4-byte native
1495 wchar_t to 2-byte unicode. */
1496 else if (maxchar < 65536) {
1497 assert(num_surrogates == 0 &&
1498 "FindMaxCharAndNumSurrogatePairs() messed up");
1499
Victor Stinner506f5922011-09-28 22:34:18 +02001500#if SIZEOF_WCHAR_T == 2
1501 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001502 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001503 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1505 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001506 _PyUnicode_UTF8(unicode) = NULL;
1507 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001508#else
1509 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001510 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001511 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001512 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001513 PyErr_NoMemory();
1514 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 }
Victor Stinner506f5922011-09-28 22:34:18 +02001516 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1517 _PyUnicode_WSTR(unicode), end,
1518 PyUnicode_2BYTE_DATA(unicode));
1519 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1520 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1521 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001522 _PyUnicode_UTF8(unicode) = NULL;
1523 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001524 PyObject_FREE(_PyUnicode_WSTR(unicode));
1525 _PyUnicode_WSTR(unicode) = NULL;
1526 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1527#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528 }
1529 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1530 else {
1531#if SIZEOF_WCHAR_T == 2
1532 /* in case the native representation is 2-bytes, we need to allocate a
1533 new normalized 4-byte version. */
1534 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001535 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1536 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 PyErr_NoMemory();
1538 return -1;
1539 }
1540 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1541 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001542 _PyUnicode_UTF8(unicode) = NULL;
1543 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001544 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1545 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001546 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 PyObject_FREE(_PyUnicode_WSTR(unicode));
1548 _PyUnicode_WSTR(unicode) = NULL;
1549 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1550#else
1551 assert(num_surrogates == 0);
1552
Victor Stinnerc3c74152011-10-02 20:39:55 +02001553 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001555 _PyUnicode_UTF8(unicode) = NULL;
1556 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1558#endif
1559 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1560 }
1561 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001562 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 return 0;
1564}
1565
Alexander Belopolsky40018472011-02-26 01:02:56 +00001566static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001567unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568{
Walter Dörwald16807132007-05-25 13:52:07 +00001569 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 case SSTATE_NOT_INTERNED:
1571 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001572
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 case SSTATE_INTERNED_MORTAL:
1574 /* revive dead object temporarily for DelItem */
1575 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001576 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 Py_FatalError(
1578 "deletion of interned string failed");
1579 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001580
Benjamin Peterson29060642009-01-31 22:14:21 +00001581 case SSTATE_INTERNED_IMMORTAL:
1582 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001583
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 default:
1585 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001586 }
1587
Victor Stinner03490912011-10-03 23:45:12 +02001588 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001590 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001591 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001592 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1593 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001595 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596}
1597
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singleton
   objects visible here -- the empty string (unicode_empty) or an entry of
   the unicode_latin1[] single-character cache -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a real kind can be a cache entry. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1614
Alexander Belopolsky40018472011-02-26 01:02:56 +00001615static int
Victor Stinner488fa492011-12-12 00:01:39 +01001616unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001617{
Victor Stinner488fa492011-12-12 00:01:39 +01001618 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001619 if (Py_REFCNT(unicode) != 1)
1620 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001621 if (_PyUnicode_HASH(unicode) != -1)
1622 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 if (PyUnicode_CHECK_INTERNED(unicode))
1624 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001625 if (!PyUnicode_CheckExact(unicode))
1626 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001627#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001628 /* singleton refcount is greater than 1 */
1629 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001630#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 return 1;
1632}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001633
Victor Stinnerfe226c02011-10-03 03:52:20 +02001634static int
1635unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1636{
1637 PyObject *unicode;
1638 Py_ssize_t old_length;
1639
1640 assert(p_unicode != NULL);
1641 unicode = *p_unicode;
1642
1643 assert(unicode != NULL);
1644 assert(PyUnicode_Check(unicode));
1645 assert(0 <= length);
1646
Victor Stinner910337b2011-10-03 03:20:16 +02001647 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001648 old_length = PyUnicode_WSTR_LENGTH(unicode);
1649 else
1650 old_length = PyUnicode_GET_LENGTH(unicode);
1651 if (old_length == length)
1652 return 0;
1653
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001655 _Py_INCREF_UNICODE_EMPTY();
1656 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001657 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001658 Py_DECREF(*p_unicode);
1659 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001660 return 0;
1661 }
1662
Victor Stinner488fa492011-12-12 00:01:39 +01001663 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001664 PyObject *copy = resize_copy(unicode, length);
1665 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001666 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001667 Py_DECREF(*p_unicode);
1668 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001670 }
1671
Victor Stinnerfe226c02011-10-03 03:52:20 +02001672 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001673 PyObject *new_unicode = resize_compact(unicode, length);
1674 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001675 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001676 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001677 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001678 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001679 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001680}
1681
Alexander Belopolsky40018472011-02-26 01:02:56 +00001682int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001683PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001684{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001685 PyObject *unicode;
1686 if (p_unicode == NULL) {
1687 PyErr_BadInternalCall();
1688 return -1;
1689 }
1690 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001691 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001692 {
1693 PyErr_BadInternalCall();
1694 return -1;
1695 }
1696 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001697}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001698
Victor Stinnerc5166102012-02-22 13:55:02 +01001699/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001700
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001701 WARNING: The function doesn't copy the terminating null character and
1702 doesn't check the maximum character (may write a latin1 character in an
1703 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001704static void
1705unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1706 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001707{
1708 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1709 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001710 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001711
1712 switch (kind) {
1713 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001714 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001715#ifdef Py_DEBUG
1716 if (PyUnicode_IS_ASCII(unicode)) {
1717 Py_UCS4 maxchar = ucs1lib_find_max_char(
1718 (const Py_UCS1*)str,
1719 (const Py_UCS1*)str + len);
1720 assert(maxchar < 128);
1721 }
1722#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001723 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001724 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001725 }
1726 case PyUnicode_2BYTE_KIND: {
1727 Py_UCS2 *start = (Py_UCS2 *)data + index;
1728 Py_UCS2 *ucs2 = start;
1729 assert(index <= PyUnicode_GET_LENGTH(unicode));
1730
Victor Stinner184252a2012-06-16 02:57:41 +02001731 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 *ucs2 = (Py_UCS2)*str;
1733
1734 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001735 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001736 }
1737 default: {
1738 Py_UCS4 *start = (Py_UCS4 *)data + index;
1739 Py_UCS4 *ucs4 = start;
1740 assert(kind == PyUnicode_4BYTE_KIND);
1741 assert(index <= PyUnicode_GET_LENGTH(unicode));
1742
Victor Stinner184252a2012-06-16 02:57:41 +02001743 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001744 *ucs4 = (Py_UCS4)*str;
1745
1746 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001747 }
1748 }
1749}
1750
1751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752static PyObject*
1753get_latin1_char(unsigned char ch)
1754{
Victor Stinnera464fc12011-10-02 20:39:30 +02001755 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode)
1759 return NULL;
1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 unicode_latin1[ch] = unicode;
1763 }
1764 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001765 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Alexander Belopolsky40018472011-02-26 01:02:56 +00001768PyObject *
1769PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001771 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 Py_UCS4 maxchar = 0;
1773 Py_ssize_t num_surrogates;
1774
1775 if (u == NULL)
1776 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001778 /* If the Unicode data is known at construction time, we can apply
1779 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001782 if (size == 0)
1783 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 /* Single character Unicode objects in the Latin-1 range are
1786 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001787 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 return get_latin1_char((unsigned char)*u);
1789
1790 /* If not empty and not single character, copy the Unicode data
1791 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001792 if (find_maxchar_surrogates(u, u + size,
1793 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 return NULL;
1795
Victor Stinner8faf8212011-12-08 22:14:11 +01001796 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 if (!unicode)
1798 return NULL;
1799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 switch (PyUnicode_KIND(unicode)) {
1801 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001802 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1804 break;
1805 case PyUnicode_2BYTE_KIND:
1806#if Py_UNICODE_SIZE == 2
1807 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1808#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001809 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1811#endif
1812 break;
1813 case PyUnicode_4BYTE_KIND:
1814#if SIZEOF_WCHAR_T == 2
1815 /* This is the only case which has to process surrogates, thus
1816 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001817 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818#else
1819 assert(num_surrogates == 0);
1820 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1821#endif
1822 break;
1823 default:
1824 assert(0 && "Impossible state");
1825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001827 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828}
1829
Alexander Belopolsky40018472011-02-26 01:02:56 +00001830PyObject *
1831PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001832{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001833 if (size < 0) {
1834 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001835 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 return NULL;
1837 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001838 if (u != NULL)
1839 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1840 else
1841 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001842}
1843
Alexander Belopolsky40018472011-02-26 01:02:56 +00001844PyObject *
1845PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001846{
1847 size_t size = strlen(u);
1848 if (size > PY_SSIZE_T_MAX) {
1849 PyErr_SetString(PyExc_OverflowError, "input too long");
1850 return NULL;
1851 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001852 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001853}
1854
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855PyObject *
1856_PyUnicode_FromId(_Py_Identifier *id)
1857{
1858 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001859 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1860 strlen(id->string),
1861 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001862 if (!id->object)
1863 return NULL;
1864 PyUnicode_InternInPlace(&id->object);
1865 assert(!id->next);
1866 id->next = static_strings;
1867 static_strings = id;
1868 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001869 return id->object;
1870}
1871
1872void
1873_PyUnicode_ClearStaticStrings()
1874{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001875 _Py_Identifier *tmp, *s = static_strings;
1876 while (s) {
1877 Py_DECREF(s->object);
1878 s->object = NULL;
1879 tmp = s->next;
1880 s->next = NULL;
1881 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001882 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001883 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001884}
1885
Benjamin Peterson0df54292012-03-26 14:50:32 -04001886/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001887
Victor Stinnerd3f08822012-05-29 12:57:52 +02001888PyObject*
1889_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001890{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001891 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001892 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001893 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001894#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001895 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001896#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001897 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001898 }
Victor Stinner785938e2011-12-11 20:09:03 +01001899 unicode = PyUnicode_New(size, 127);
1900 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001901 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001902 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1903 assert(_PyUnicode_CheckConsistency(unicode, 1));
1904 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001905}
1906
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907static Py_UCS4
1908kind_maxchar_limit(unsigned int kind)
1909{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001910 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001911 case PyUnicode_1BYTE_KIND:
1912 return 0x80;
1913 case PyUnicode_2BYTE_KIND:
1914 return 0x100;
1915 case PyUnicode_4BYTE_KIND:
1916 return 0x10000;
1917 default:
1918 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001919 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001920 }
1921}
1922
Victor Stinnere6abb482012-05-02 01:15:40 +02001923Py_LOCAL_INLINE(Py_UCS4)
1924align_maxchar(Py_UCS4 maxchar)
1925{
1926 if (maxchar <= 127)
1927 return 127;
1928 else if (maxchar <= 255)
1929 return 255;
1930 else if (maxchar <= 65535)
1931 return 65535;
1932 else
1933 return MAX_UNICODE;
1934}
1935
Victor Stinner702c7342011-10-05 13:50:52 +02001936static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001937_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001940 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001941
Serhiy Storchaka678db842013-01-26 12:16:36 +02001942 if (size == 0)
1943 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001944 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001945 if (size == 1)
1946 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001947
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001948 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001949 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 if (!res)
1951 return NULL;
1952 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001953 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001955}
1956
Victor Stinnere57b1c02011-09-28 22:20:48 +02001957static PyObject*
1958_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959{
1960 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001961 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001962
Serhiy Storchaka678db842013-01-26 12:16:36 +02001963 if (size == 0)
1964 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001965 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001966 if (size == 1) {
1967 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001968 int kind;
1969 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001970 if (ch < 256)
1971 return get_latin1_char((unsigned char)ch);
1972
1973 res = PyUnicode_New(1, ch);
1974 if (res == NULL)
1975 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001976 kind = PyUnicode_KIND(res);
1977 data = PyUnicode_DATA(res);
1978 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001979 assert(_PyUnicode_CheckConsistency(res, 1));
1980 return res;
1981 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001982
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001983 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001984 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 if (!res)
1986 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001987 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001989 else {
1990 _PyUnicode_CONVERT_BYTES(
1991 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1992 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001993 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 return res;
1995}
1996
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997static PyObject*
1998_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999{
2000 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002
Serhiy Storchaka678db842013-01-26 12:16:36 +02002003 if (size == 0)
2004 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002005 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002006 if (size == 1) {
2007 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002008 int kind;
2009 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002010 if (ch < 256)
2011 return get_latin1_char((unsigned char)ch);
2012
2013 res = PyUnicode_New(1, ch);
2014 if (res == NULL)
2015 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002016 kind = PyUnicode_KIND(res);
2017 data = PyUnicode_DATA(res);
2018 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002019 assert(_PyUnicode_CheckConsistency(res, 1));
2020 return res;
2021 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002022
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002023 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002024 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 if (!res)
2026 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002027 if (max_char < 256)
2028 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2029 PyUnicode_1BYTE_DATA(res));
2030 else if (max_char < 0x10000)
2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2032 PyUnicode_2BYTE_DATA(res));
2033 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002035 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 return res;
2037}
2038
2039PyObject*
2040PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2041{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002042 if (size < 0) {
2043 PyErr_SetString(PyExc_ValueError, "size must be positive");
2044 return NULL;
2045 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002046 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002048 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002050 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002052 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002053 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002054 PyErr_SetString(PyExc_SystemError, "invalid kind");
2055 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057}
2058
Victor Stinnerece58de2012-04-23 23:36:38 +02002059Py_UCS4
2060_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2061{
2062 enum PyUnicode_Kind kind;
2063 void *startptr, *endptr;
2064
2065 assert(PyUnicode_IS_READY(unicode));
2066 assert(0 <= start);
2067 assert(end <= PyUnicode_GET_LENGTH(unicode));
2068 assert(start <= end);
2069
2070 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2071 return PyUnicode_MAX_CHAR_VALUE(unicode);
2072
2073 if (start == end)
2074 return 127;
2075
Victor Stinner94d558b2012-04-27 22:26:58 +02002076 if (PyUnicode_IS_ASCII(unicode))
2077 return 127;
2078
Victor Stinnerece58de2012-04-23 23:36:38 +02002079 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002080 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002081 endptr = (char *)startptr + end * kind;
2082 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002083 switch(kind) {
2084 case PyUnicode_1BYTE_KIND:
2085 return ucs1lib_find_max_char(startptr, endptr);
2086 case PyUnicode_2BYTE_KIND:
2087 return ucs2lib_find_max_char(startptr, endptr);
2088 case PyUnicode_4BYTE_KIND:
2089 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002090 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002091 assert(0);
2092 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002093 }
2094}
2095
Victor Stinner25a4b292011-10-06 12:31:55 +02002096/* Ensure that a string uses the most efficient storage, if it is not the
2097 case: create a new string with of the right kind. Write NULL into *p_unicode
2098 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002099static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002100unicode_adjust_maxchar(PyObject **p_unicode)
2101{
2102 PyObject *unicode, *copy;
2103 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002104 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002105 unsigned int kind;
2106
2107 assert(p_unicode != NULL);
2108 unicode = *p_unicode;
2109 assert(PyUnicode_IS_READY(unicode));
2110 if (PyUnicode_IS_ASCII(unicode))
2111 return;
2112
2113 len = PyUnicode_GET_LENGTH(unicode);
2114 kind = PyUnicode_KIND(unicode);
2115 if (kind == PyUnicode_1BYTE_KIND) {
2116 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002117 max_char = ucs1lib_find_max_char(u, u + len);
2118 if (max_char >= 128)
2119 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002120 }
2121 else if (kind == PyUnicode_2BYTE_KIND) {
2122 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002123 max_char = ucs2lib_find_max_char(u, u + len);
2124 if (max_char >= 256)
2125 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002126 }
2127 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002128 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002130 max_char = ucs4lib_find_max_char(u, u + len);
2131 if (max_char >= 0x10000)
2132 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002135 if (copy != NULL)
2136 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 Py_DECREF(unicode);
2138 *p_unicode = copy;
2139}
2140
Victor Stinner034f6cf2011-09-30 02:26:44 +02002141PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002142_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002143{
Victor Stinner87af4f22011-11-21 23:03:47 +01002144 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146
Victor Stinner034f6cf2011-09-30 02:26:44 +02002147 if (!PyUnicode_Check(unicode)) {
2148 PyErr_BadInternalCall();
2149 return NULL;
2150 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002151 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002152 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002153
Victor Stinner87af4f22011-11-21 23:03:47 +01002154 length = PyUnicode_GET_LENGTH(unicode);
2155 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156 if (!copy)
2157 return NULL;
2158 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2159
Victor Stinner87af4f22011-11-21 23:03:47 +01002160 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2161 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002162 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002163 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002164}
2165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167/* Widen Unicode objects to larger buffers. Don't write terminating null
2168 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169
2170void*
2171_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2172{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002173 Py_ssize_t len;
2174 void *result;
2175 unsigned int skind;
2176
Benjamin Petersonbac79492012-01-14 13:34:47 -05002177 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002178 return NULL;
2179
2180 len = PyUnicode_GET_LENGTH(s);
2181 skind = PyUnicode_KIND(s);
2182 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002183 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 return NULL;
2185 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002186 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002187 case PyUnicode_2BYTE_KIND:
2188 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2189 if (!result)
2190 return PyErr_NoMemory();
2191 assert(skind == PyUnicode_1BYTE_KIND);
2192 _PyUnicode_CONVERT_BYTES(
2193 Py_UCS1, Py_UCS2,
2194 PyUnicode_1BYTE_DATA(s),
2195 PyUnicode_1BYTE_DATA(s) + len,
2196 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002198 case PyUnicode_4BYTE_KIND:
2199 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2200 if (!result)
2201 return PyErr_NoMemory();
2202 if (skind == PyUnicode_2BYTE_KIND) {
2203 _PyUnicode_CONVERT_BYTES(
2204 Py_UCS2, Py_UCS4,
2205 PyUnicode_2BYTE_DATA(s),
2206 PyUnicode_2BYTE_DATA(s) + len,
2207 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002209 else {
2210 assert(skind == PyUnicode_1BYTE_KIND);
2211 _PyUnicode_CONVERT_BYTES(
2212 Py_UCS1, Py_UCS4,
2213 PyUnicode_1BYTE_DATA(s),
2214 PyUnicode_1BYTE_DATA(s) + len,
2215 result);
2216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002218 default:
2219 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 }
Victor Stinner01698042011-10-04 00:04:26 +02002221 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 return NULL;
2223}
2224
2225static Py_UCS4*
2226as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2227 int copy_null)
2228{
2229 int kind;
2230 void *data;
2231 Py_ssize_t len, targetlen;
2232 if (PyUnicode_READY(string) == -1)
2233 return NULL;
2234 kind = PyUnicode_KIND(string);
2235 data = PyUnicode_DATA(string);
2236 len = PyUnicode_GET_LENGTH(string);
2237 targetlen = len;
2238 if (copy_null)
2239 targetlen++;
2240 if (!target) {
2241 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2242 PyErr_NoMemory();
2243 return NULL;
2244 }
2245 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2246 if (!target) {
2247 PyErr_NoMemory();
2248 return NULL;
2249 }
2250 }
2251 else {
2252 if (targetsize < targetlen) {
2253 PyErr_Format(PyExc_SystemError,
2254 "string is longer than the buffer");
2255 if (copy_null && 0 < targetsize)
2256 target[0] = 0;
2257 return NULL;
2258 }
2259 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 if (kind == PyUnicode_1BYTE_KIND) {
2261 Py_UCS1 *start = (Py_UCS1 *) data;
2262 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002264 else if (kind == PyUnicode_2BYTE_KIND) {
2265 Py_UCS2 *start = (Py_UCS2 *) data;
2266 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2267 }
2268 else {
2269 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 if (copy_null)
2273 target[len] = 0;
2274 return target;
2275}
2276
2277Py_UCS4*
2278PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2279 int copy_null)
2280{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002281 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 PyErr_BadInternalCall();
2283 return NULL;
2284 }
2285 return as_ucs4(string, target, targetsize, copy_null);
2286}
2287
2288Py_UCS4*
2289PyUnicode_AsUCS4Copy(PyObject *string)
2290{
2291 return as_ucs4(string, NULL, 0, 1);
2292}
2293
2294#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002295
Alexander Belopolsky40018472011-02-26 01:02:56 +00002296PyObject *
2297PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002301 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 PyErr_BadInternalCall();
2303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 }
2305
Martin v. Löwis790465f2008-04-05 20:41:37 +00002306 if (size == -1) {
2307 size = wcslen(w);
2308 }
2309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311}
2312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002314
Walter Dörwald346737f2007-05-31 10:44:43 +00002315static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002316makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002317 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002318{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 if (longflag)
2321 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002322 else if (longlongflag) {
2323 /* longlongflag should only ever be nonzero on machines with
2324 HAVE_LONG_LONG defined */
2325#ifdef HAVE_LONG_LONG
2326 char *f = PY_FORMAT_LONG_LONG;
2327 while (*f)
2328 *fmt++ = *f++;
2329#else
2330 /* we shouldn't ever get here */
2331 assert(0);
2332 *fmt++ = 'l';
2333#endif
2334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 else if (size_tflag) {
2336 char *f = PY_FORMAT_SIZE_T;
2337 while (*f)
2338 *fmt++ = *f++;
2339 }
2340 *fmt++ = c;
2341 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002342}
2343
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   Buffers of this size are filled with sprintf() below, so the value must
   also leave room for the terminating null character. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)

/* Format one conversion specification of a PyUnicode_FromFormatV() format
   string and append the result to `writer`.

   `f` points at the '%' that starts the specification; the arguments are
   pulled from *vargs with va_arg().  Recognized conversions: %c, %i, %d, %u
   (with an optional l, ll or z modifier; ll only if HAVE_LONG_LONG is
   defined), %x, %p, %s, %U, %V, %S, %R, %A and %%.  An optional '0' flag, a
   width and a ".precision" are parsed for every conversion, but only the
   integer conversions (%i, %d, %u, %x) use them.

   Return a pointer to the first character of the format string that was not
   consumed, or NULL on error.  A width or precision overflowing an int and
   an out-of-range %c argument raise ValueError here; other failures are
   those reported by the called helpers.  On an unrecognized conversion the
   rest of the format string, starting at the '%', is copied verbatim and
   the returned pointer is the end of the format string. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    int width;
    int precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    int fill;

    /* Remember where the specification starts: the default branch below
       copies the format string from this position. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = 0;
    while (Py_ISDIGIT((unsigned)*f)) {
        /* Check before multiplying so that width never overflows an int. */
        if (width > (INT_MAX - ((int)*f - '0')) / 10) {
            PyErr_SetString(PyExc_ValueError,
                            "width too big");
            return NULL;
        }
        width = (width*10) + (*f - '0');
        f++;
    }
    precision = 0;
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Same overflow guard as for the width. */
            if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "precision too big");
                return NULL;
            }
            precision = (precision*10) + (*f - '0');
            f++;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* A modifier is only recognized when directly followed by d, u or i;
       otherwise f is left on the modifier character itself, which then hits
       the default branch of the switch below. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format string:
       turn off the writer's overallocation.
       NOTE(review): presumably because nothing more will be appended after
       this conversion -- confirm against _PyUnicodeWriter. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single code point, passed as an int. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_ValueError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char fmt[10]; /* should be enough for "%0lld\0" */
        char buffer[MAX_LONG_LONG_CHARS];

        /* The number is first formatted without width/precision by the C
           library; the padding is added by hand afterwards. */
        if (*f == 'u') {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* No length modifier: the modifier parsing above only accepts
               d, u and i, so the flags are never set for 'x'. */
            makefmt(fmt, 0, 0, 0, 'x');
            len = sprintf(buffer, fmt, va_arg(*vargs, int));
        }
        else {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* Output layout: (width - precision) fill characters, then
           (precision - len) zeros, then the sprintf() output.
           NOTE(review): the fill is written before the whole sprintf()
           output, so with the '0' flag and a negative value the zeros end
           up in front of the '-' sign -- confirm this is intended. */
        if (precision < len)
            precision = len;
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
                return NULL;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        if (precision > len) {
            fill = precision - len;
            if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
                return NULL;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }
        if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the digits (and the null) right to make room for the
               "0x" prefix. */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        /* The C string is decoded with the "replace" error handler. */
        const char *s = va_arg(*vargs, const char*);
        PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
        if (!str)
            return NULL;
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'U':
    {
        /* A str object, written as-is; the reference stays with the
           caller. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object (may be NULL) and
           a UTF-8 C string used only when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        PyObject *str_obj;
        assert(obj || str);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
                return NULL;
        }
        else {
            str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
            if (!str_obj)
                return NULL;
            if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
                Py_DECREF(str_obj);
                return NULL;
            }
            Py_DECREF(str_obj);
        }
        break;
    }

    case 'S':
    {
        /* Result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* Literal percent sign; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
2660
Walter Dörwaldd2034312007-05-18 16:29:38 +00002661PyObject *
2662PyUnicode_FromFormatV(const char *format, va_list vargs)
2663{
Victor Stinnere215d962012-10-06 23:03:36 +02002664 va_list vargs2;
2665 const char *f;
2666 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667
Victor Stinnere215d962012-10-06 23:03:36 +02002668 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2669
2670 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2671 Copy it to be able to pass a reference to a subfunction. */
2672 Py_VA_COPY(vargs2, vargs);
2673
2674 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002676 f = unicode_fromformat_arg(&writer, f, &vargs2);
2677 if (f == NULL)
2678 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002681 const char *p;
2682 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002683
Victor Stinnere215d962012-10-06 23:03:36 +02002684 p = f;
2685 do
2686 {
2687 if ((unsigned char)*p > 127) {
2688 PyErr_Format(PyExc_ValueError,
2689 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2690 "string, got a non-ASCII byte: 0x%02x",
2691 (unsigned char)*p);
2692 return NULL;
2693 }
2694 p++;
2695 }
2696 while (*p != '\0' && *p != '%');
2697 len = p - f;
2698
2699 if (*p == '\0')
2700 writer.overallocate = 0;
2701 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2702 goto fail;
2703 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2704 writer.pos += len;
2705
2706 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 }
Victor Stinnere215d962012-10-06 23:03:36 +02002709 return _PyUnicodeWriter_Finish(&writer);
2710
2711 fail:
2712 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002714}
2715
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716PyObject *
2717PyUnicode_FromFormat(const char *format, ...)
2718{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002719 PyObject* ret;
2720 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002721
2722#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002723 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002724#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002726#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 ret = PyUnicode_FromFormatV(format, vargs);
2728 va_end(vargs);
2729 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002730}
2731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732#ifdef HAVE_WCHAR_H
2733
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2735 convert a Unicode object to a wide character string.
2736
Victor Stinnerd88d9832011-09-06 02:00:05 +02002737 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002738 character) required to convert the unicode object. Ignore size argument.
2739
Victor Stinnerd88d9832011-09-06 02:00:05 +02002740 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002741 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002742 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002744unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002745 wchar_t *w,
2746 Py_ssize_t size)
2747{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749 const wchar_t *wstr;
2750
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002751 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 if (wstr == NULL)
2753 return -1;
2754
Victor Stinner5593d8a2010-10-02 11:11:27 +00002755 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002756 if (size > res)
2757 size = res + 1;
2758 else
2759 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002761 return res;
2762 }
2763 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002765}
2766
2767Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002768PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002769 wchar_t *w,
2770 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771{
2772 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002773 PyErr_BadInternalCall();
2774 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002776 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777}
2778
Victor Stinner137c34c2010-09-29 10:25:54 +00002779wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002780PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002781 Py_ssize_t *size)
2782{
2783 wchar_t* buffer;
2784 Py_ssize_t buflen;
2785
2786 if (unicode == NULL) {
2787 PyErr_BadInternalCall();
2788 return NULL;
2789 }
2790
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002791 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 if (buflen == -1)
2793 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002795 PyErr_NoMemory();
2796 return NULL;
2797 }
2798
Victor Stinner137c34c2010-09-29 10:25:54 +00002799 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2800 if (buffer == NULL) {
2801 PyErr_NoMemory();
2802 return NULL;
2803 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002804 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002805 if (buflen == -1) {
2806 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002808 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809 if (size != NULL)
2810 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002811 return buffer;
2812}
2813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815
Alexander Belopolsky40018472011-02-26 01:02:56 +00002816PyObject *
2817PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002820 void *data;
2821 int kind;
2822
Victor Stinner8faf8212011-12-08 22:14:11 +01002823 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 PyErr_SetString(PyExc_ValueError,
2825 "chr() arg not in range(0x110000)");
2826 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002827 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002828
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002829 if ((Py_UCS4)ordinal < 256)
2830 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 v = PyUnicode_New(1, ordinal);
2833 if (v == NULL)
2834 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002835 kind = PyUnicode_KIND(v);
2836 data = PyUnicode_DATA(v);
2837 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002838 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002840}
2841
Alexander Belopolsky40018472011-02-26 01:02:56 +00002842PyObject *
2843PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002845 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002847 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002848 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002849 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002850 Py_INCREF(obj);
2851 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002852 }
2853 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 /* For a Unicode subtype that's not a Unicode object,
2855 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002856 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002858 PyErr_Format(PyExc_TypeError,
2859 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002860 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002861 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002862}
2863
Alexander Belopolsky40018472011-02-26 01:02:56 +00002864PyObject *
2865PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002866 const char *encoding,
2867 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002868{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002869 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002870 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002871
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002873 PyErr_BadInternalCall();
2874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002876
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 /* Decoding bytes objects is the most common case and should be fast */
2878 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002879 if (PyBytes_GET_SIZE(obj) == 0)
2880 _Py_RETURN_UNICODE_EMPTY();
2881 v = PyUnicode_Decode(
2882 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2883 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002884 return v;
2885 }
2886
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002887 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002888 PyErr_SetString(PyExc_TypeError,
2889 "decoding str is not supported");
2890 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002891 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2894 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2895 PyErr_Format(PyExc_TypeError,
2896 "coercing to str: need bytes, bytearray "
2897 "or buffer-like object, %.80s found",
2898 Py_TYPE(obj)->tp_name);
2899 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002900 }
Tim Petersced69f82003-09-16 20:30:58 +00002901
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002902 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002903 PyBuffer_Release(&buffer);
2904 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002906
Serhiy Storchaka05997252013-01-26 12:14:02 +02002907 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002908 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002909 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910}
2911
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   encoding:  NUL-terminated codec name, or NULL.
   lower:     output buffer receiving the NUL-terminated normalized name.
   lower_len: size of the output buffer in bytes, including the room needed
              for the terminating NUL.

   Return 0 on error (the result, including its terminating NUL, does not
   fit in lower_len bytes), 1 on success.  On error the content of lower
   is unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* The default name is subject to the same size limit as any other:
           never write past the caller's buffer. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* Without this guard, lower_len - 1 would wrap around and l_end would
       point far outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last slot of the buffer, reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2948
Alexander Belopolsky40018472011-02-26 01:02:56 +00002949PyObject *
2950PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002951 Py_ssize_t size,
2952 const char *encoding,
2953 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002954{
2955 PyObject *buffer = NULL, *unicode;
2956 Py_buffer info;
2957 char lower[11]; /* Enough for any encoding shortcut */
2958
Fred Drakee4315f52000-05-09 19:53:39 +00002959 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002960 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002961 if ((strcmp(lower, "utf-8") == 0) ||
2962 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002963 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002964 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002965 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002966 (strcmp(lower, "iso-8859-1") == 0))
2967 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002968#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002969 else if (strcmp(lower, "mbcs") == 0)
2970 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002971#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002972 else if (strcmp(lower, "ascii") == 0)
2973 return PyUnicode_DecodeASCII(s, size, errors);
2974 else if (strcmp(lower, "utf-16") == 0)
2975 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2976 else if (strcmp(lower, "utf-32") == 0)
2977 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979
2980 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002981 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002982 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002983 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002984 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 if (buffer == NULL)
2986 goto onError;
2987 unicode = PyCodec_Decode(buffer, encoding, errors);
2988 if (unicode == NULL)
2989 goto onError;
2990 if (!PyUnicode_Check(unicode)) {
2991 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002992 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002993 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 Py_DECREF(unicode);
2995 goto onError;
2996 }
2997 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002998 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002999
Benjamin Peterson29060642009-01-31 22:14:21 +00003000 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 Py_XDECREF(buffer);
3002 return NULL;
3003}
3004
Alexander Belopolsky40018472011-02-26 01:02:56 +00003005PyObject *
3006PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003007 const char *encoding,
3008 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003009{
3010 PyObject *v;
3011
3012 if (!PyUnicode_Check(unicode)) {
3013 PyErr_BadArgument();
3014 goto onError;
3015 }
3016
3017 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003019
3020 /* Decode via the codec registry */
3021 v = PyCodec_Decode(unicode, encoding, errors);
3022 if (v == NULL)
3023 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003024 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003025
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003027 return NULL;
3028}
3029
Alexander Belopolsky40018472011-02-26 01:02:56 +00003030PyObject *
3031PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003032 const char *encoding,
3033 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003034{
3035 PyObject *v;
3036
3037 if (!PyUnicode_Check(unicode)) {
3038 PyErr_BadArgument();
3039 goto onError;
3040 }
3041
3042 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044
3045 /* Decode via the codec registry */
3046 v = PyCodec_Decode(unicode, encoding, errors);
3047 if (v == NULL)
3048 goto onError;
3049 if (!PyUnicode_Check(v)) {
3050 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003051 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003052 Py_TYPE(v)->tp_name);
3053 Py_DECREF(v);
3054 goto onError;
3055 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003056 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003057
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003059 return NULL;
3060}
3061
Alexander Belopolsky40018472011-02-26 01:02:56 +00003062PyObject *
3063PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003064 Py_ssize_t size,
3065 const char *encoding,
3066 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067{
3068 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003069
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 unicode = PyUnicode_FromUnicode(s, size);
3071 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3074 Py_DECREF(unicode);
3075 return v;
3076}
3077
Alexander Belopolsky40018472011-02-26 01:02:56 +00003078PyObject *
3079PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003080 const char *encoding,
3081 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003082{
3083 PyObject *v;
3084
3085 if (!PyUnicode_Check(unicode)) {
3086 PyErr_BadArgument();
3087 goto onError;
3088 }
3089
3090 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003092
3093 /* Encode via the codec registry */
3094 v = PyCodec_Encode(unicode, encoding, errors);
3095 if (v == NULL)
3096 goto onError;
3097 return v;
3098
Benjamin Peterson29060642009-01-31 22:14:21 +00003099 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003100 return NULL;
3101}
3102
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Returns the offset, in wchar_t
   units, of the first unit that fails, or 0 if every unit converts.  A
   return value of 0 is therefore ambiguous. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) plus its terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif
    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of the pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3149
Victor Stinner1b579672011-12-17 05:47:23 +01003150static int
3151locale_error_handler(const char *errors, int *surrogateescape)
3152{
3153 if (errors == NULL) {
3154 *surrogateescape = 0;
3155 return 0;
3156 }
3157
3158 if (strcmp(errors, "strict") == 0) {
3159 *surrogateescape = 0;
3160 return 0;
3161 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003162 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003163 *surrogateescape = 1;
3164 return 0;
3165 }
3166 PyErr_Format(PyExc_ValueError,
3167 "only 'strict' and 'surrogateescape' error handlers "
3168 "are supported, not '%s'",
3169 errors);
3170 return -1;
3171}
3172
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003173PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003174PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003175{
3176 Py_ssize_t wlen, wlen2;
3177 wchar_t *wstr;
3178 PyObject *bytes = NULL;
3179 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003180 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003181 PyObject *exc;
3182 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003183 int surrogateescape;
3184
3185 if (locale_error_handler(errors, &surrogateescape) < 0)
3186 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003187
3188 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3189 if (wstr == NULL)
3190 return NULL;
3191
3192 wlen2 = wcslen(wstr);
3193 if (wlen2 != wlen) {
3194 PyMem_Free(wstr);
3195 PyErr_SetString(PyExc_TypeError, "embedded null character");
3196 return NULL;
3197 }
3198
3199 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003200 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003201 char *str;
3202
3203 str = _Py_wchar2char(wstr, &error_pos);
3204 if (str == NULL) {
3205 if (error_pos == (size_t)-1) {
3206 PyErr_NoMemory();
3207 PyMem_Free(wstr);
3208 return NULL;
3209 }
3210 else {
3211 goto encode_error;
3212 }
3213 }
3214 PyMem_Free(wstr);
3215
3216 bytes = PyBytes_FromString(str);
3217 PyMem_Free(str);
3218 }
3219 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003220 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003221 size_t len, len2;
3222
3223 len = wcstombs(NULL, wstr, 0);
3224 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003225 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003226 goto encode_error;
3227 }
3228
3229 bytes = PyBytes_FromStringAndSize(NULL, len);
3230 if (bytes == NULL) {
3231 PyMem_Free(wstr);
3232 return NULL;
3233 }
3234
3235 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3236 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003237 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003238 goto encode_error;
3239 }
3240 PyMem_Free(wstr);
3241 }
3242 return bytes;
3243
3244encode_error:
3245 errmsg = strerror(errno);
3246 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003247
3248 if (error_pos == (size_t)-1)
3249 error_pos = wcstombs_errorpos(wstr);
3250
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003251 PyMem_Free(wstr);
3252 Py_XDECREF(bytes);
3253
Victor Stinner2f197072011-12-17 07:08:30 +01003254 if (errmsg != NULL) {
3255 size_t errlen;
3256 wstr = _Py_char2wchar(errmsg, &errlen);
3257 if (wstr != NULL) {
3258 reason = PyUnicode_FromWideChar(wstr, errlen);
3259 PyMem_Free(wstr);
3260 } else
3261 errmsg = NULL;
3262 }
3263 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003264 reason = PyUnicode_FromString(
3265 "wcstombs() encountered an unencodable "
3266 "wide character");
3267 if (reason == NULL)
3268 return NULL;
3269
3270 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3271 "locale", unicode,
3272 (Py_ssize_t)error_pos,
3273 (Py_ssize_t)(error_pos+1),
3274 reason);
3275 Py_DECREF(reason);
3276 if (exc != NULL) {
3277 PyCodec_StrictErrors(exc);
3278 Py_XDECREF(exc);
3279 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003280 return NULL;
3281}
3282
Victor Stinnerad158722010-10-27 00:25:46 +00003283PyObject *
3284PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003285{
Victor Stinner99b95382011-07-04 14:23:54 +02003286#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003287 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003288#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003289 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003290#else
Victor Stinner793b5312011-04-27 00:24:21 +02003291 PyInterpreterState *interp = PyThreadState_GET()->interp;
3292 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3293 cannot use it to encode and decode filenames before it is loaded. Load
3294 the Python codec requires to encode at least its own filename. Use the C
3295 version of the locale codec until the codec registry is initialized and
3296 the Python codec is loaded.
3297
3298 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3299 cannot only rely on it: check also interp->fscodec_initialized for
3300 subinterpreters. */
3301 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003302 return PyUnicode_AsEncodedString(unicode,
3303 Py_FileSystemDefaultEncoding,
3304 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003305 }
3306 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003307 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003308 }
Victor Stinnerad158722010-10-27 00:25:46 +00003309#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003310}
3311
Alexander Belopolsky40018472011-02-26 01:02:56 +00003312PyObject *
3313PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003314 const char *encoding,
3315 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316{
3317 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003318 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 if (!PyUnicode_Check(unicode)) {
3321 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 }
Fred Drakee4315f52000-05-09 19:53:39 +00003324
Fred Drakee4315f52000-05-09 19:53:39 +00003325 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003326 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003327 if ((strcmp(lower, "utf-8") == 0) ||
3328 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003329 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003330 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003331 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003332 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003333 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003334 }
Victor Stinner37296e82010-06-10 13:36:23 +00003335 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003336 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003337 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003338 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003339#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003340 else if (strcmp(lower, "mbcs") == 0)
3341 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003342#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003343 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003344 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346
3347 /* Encode via the codec registry */
3348 v = PyCodec_Encode(unicode, encoding, errors);
3349 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003350 return NULL;
3351
3352 /* The normal path */
3353 if (PyBytes_Check(v))
3354 return v;
3355
3356 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003357 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003358 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003359 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003360
3361 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3362 "encoder %s returned bytearray instead of bytes",
3363 encoding);
3364 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003365 Py_DECREF(v);
3366 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003367 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003368
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003369 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3370 Py_DECREF(v);
3371 return b;
3372 }
3373
3374 PyErr_Format(PyExc_TypeError,
3375 "encoder did not return a bytes object (type=%.400s)",
3376 Py_TYPE(v)->tp_name);
3377 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003378 return NULL;
3379}
3380
Alexander Belopolsky40018472011-02-26 01:02:56 +00003381PyObject *
3382PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003383 const char *encoding,
3384 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003385{
3386 PyObject *v;
3387
3388 if (!PyUnicode_Check(unicode)) {
3389 PyErr_BadArgument();
3390 goto onError;
3391 }
3392
3393 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003395
3396 /* Encode via the codec registry */
3397 v = PyCodec_Encode(unicode, encoding, errors);
3398 if (v == NULL)
3399 goto onError;
3400 if (!PyUnicode_Check(v)) {
3401 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003402 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003403 Py_TYPE(v)->tp_name);
3404 Py_DECREF(v);
3405 goto onError;
3406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003408
Benjamin Peterson29060642009-01-31 22:14:21 +00003409 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 return NULL;
3411}
3412
/* Locate the first byte sequence of str that mbrtowc() cannot decode.

   str: multibyte string to scan.
   len: number of bytes to scan.

   Returns the byte offset of the first invalid or incomplete sequence, or
   0 if none is found.  Without HAVE_MBRTOWC the result is always 0.  A
   return value of 0 is therefore ambiguous. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3443
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003444PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003445PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003446 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003447{
3448 wchar_t smallbuf[256];
3449 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3450 wchar_t *wstr;
3451 size_t wlen, wlen2;
3452 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003453 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003454 size_t error_pos;
3455 char *errmsg;
3456 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003457
3458 if (locale_error_handler(errors, &surrogateescape) < 0)
3459 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003460
3461 if (str[len] != '\0' || len != strlen(str)) {
3462 PyErr_SetString(PyExc_TypeError, "embedded null character");
3463 return NULL;
3464 }
3465
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003466 if (surrogateescape) {
3467 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003468 wstr = _Py_char2wchar(str, &wlen);
3469 if (wstr == NULL) {
3470 if (wlen == (size_t)-1)
3471 PyErr_NoMemory();
3472 else
3473 PyErr_SetFromErrno(PyExc_OSError);
3474 return NULL;
3475 }
3476
3477 unicode = PyUnicode_FromWideChar(wstr, wlen);
3478 PyMem_Free(wstr);
3479 }
3480 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003481 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003482#ifndef HAVE_BROKEN_MBSTOWCS
3483 wlen = mbstowcs(NULL, str, 0);
3484#else
3485 wlen = len;
3486#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003487 if (wlen == (size_t)-1)
3488 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003489 if (wlen+1 <= smallbuf_len) {
3490 wstr = smallbuf;
3491 }
3492 else {
3493 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3494 return PyErr_NoMemory();
3495
3496 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3497 if (!wstr)
3498 return PyErr_NoMemory();
3499 }
3500
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501 wlen2 = mbstowcs(wstr, str, wlen+1);
3502 if (wlen2 == (size_t)-1) {
3503 if (wstr != smallbuf)
3504 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003505 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506 }
3507#ifdef HAVE_BROKEN_MBSTOWCS
3508 assert(wlen2 == wlen);
3509#endif
3510 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3511 if (wstr != smallbuf)
3512 PyMem_Free(wstr);
3513 }
3514 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003515
3516decode_error:
3517 errmsg = strerror(errno);
3518 assert(errmsg != NULL);
3519
3520 error_pos = mbstowcs_errorpos(str, len);
3521 if (errmsg != NULL) {
3522 size_t errlen;
3523 wstr = _Py_char2wchar(errmsg, &errlen);
3524 if (wstr != NULL) {
3525 reason = PyUnicode_FromWideChar(wstr, errlen);
3526 PyMem_Free(wstr);
3527 } else
3528 errmsg = NULL;
3529 }
3530 if (errmsg == NULL)
3531 reason = PyUnicode_FromString(
3532 "mbstowcs() encountered an invalid multibyte sequence");
3533 if (reason == NULL)
3534 return NULL;
3535
3536 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3537 "locale", str, len,
3538 (Py_ssize_t)error_pos,
3539 (Py_ssize_t)(error_pos+1),
3540 reason);
3541 Py_DECREF(reason);
3542 if (exc != NULL) {
3543 PyCodec_StrictErrors(exc);
3544 Py_XDECREF(exc);
3545 }
3546 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547}
3548
3549PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003550PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003551{
3552 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003553 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003554}
3555
3556
3557PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003558PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003559 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003560 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3561}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003562
Christian Heimes5894ba72007-11-04 11:43:14 +00003563PyObject*
3564PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3565{
Victor Stinner99b95382011-07-04 14:23:54 +02003566#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003567 return PyUnicode_DecodeMBCS(s, size, NULL);
3568#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003569 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003570#else
Victor Stinner793b5312011-04-27 00:24:21 +02003571 PyInterpreterState *interp = PyThreadState_GET()->interp;
3572 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3573 cannot use it to encode and decode filenames before it is loaded. Load
3574 the Python codec requires to encode at least its own filename. Use the C
3575 version of the locale codec until the codec registry is initialized and
3576 the Python codec is loaded.
3577
3578 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3579 cannot only rely on it: check also interp->fscodec_initialized for
3580 subinterpreters. */
3581 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003582 return PyUnicode_Decode(s, size,
3583 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003584 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003585 }
3586 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003587 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003588 }
Victor Stinnerad158722010-10-27 00:25:46 +00003589#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003590}
3591
Martin v. Löwis011e8422009-05-05 04:43:17 +00003592
3593int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003594_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003595{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003596 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003597
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003598 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003599 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003600 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3601 PyUnicode_GET_LENGTH(str), '\0', 1);
3602 if (pos == -1)
3603 return 0;
3604 else
3605 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003606}
3607
Antoine Pitrou13348842012-01-29 18:36:34 +01003608int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003609PyUnicode_FSConverter(PyObject* arg, void* addr)
3610{
3611 PyObject *output = NULL;
3612 Py_ssize_t size;
3613 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003614 if (arg == NULL) {
3615 Py_DECREF(*(PyObject**)addr);
3616 return 1;
3617 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003618 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003619 output = arg;
3620 Py_INCREF(output);
3621 }
3622 else {
3623 arg = PyUnicode_FromObject(arg);
3624 if (!arg)
3625 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003626 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003627 Py_DECREF(arg);
3628 if (!output)
3629 return 0;
3630 if (!PyBytes_Check(output)) {
3631 Py_DECREF(output);
3632 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3633 return 0;
3634 }
3635 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003636 size = PyBytes_GET_SIZE(output);
3637 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003638 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003639 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003640 Py_DECREF(output);
3641 return 0;
3642 }
3643 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003644 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003645}
3646
3647
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003648int
3649PyUnicode_FSDecoder(PyObject* arg, void* addr)
3650{
3651 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003652 if (arg == NULL) {
3653 Py_DECREF(*(PyObject**)addr);
3654 return 1;
3655 }
3656 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003657 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003658 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003659 output = arg;
3660 Py_INCREF(output);
3661 }
3662 else {
3663 arg = PyBytes_FromObject(arg);
3664 if (!arg)
3665 return 0;
3666 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3667 PyBytes_GET_SIZE(arg));
3668 Py_DECREF(arg);
3669 if (!output)
3670 return 0;
3671 if (!PyUnicode_Check(output)) {
3672 Py_DECREF(output);
3673 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3674 return 0;
3675 }
3676 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003677 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003678 Py_DECREF(output);
3679 return 0;
3680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003681 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003682 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003683 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3684 Py_DECREF(output);
3685 return 0;
3686 }
3687 *(PyObject**)addr = output;
3688 return Py_CLEANUP_SUPPORTED;
3689}
3690
3691
Martin v. Löwis5b222132007-06-10 09:51:05 +00003692char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003693PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003694{
Christian Heimesf3863112007-11-22 07:46:41 +00003695 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003696
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003697 if (!PyUnicode_Check(unicode)) {
3698 PyErr_BadArgument();
3699 return NULL;
3700 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003701 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003702 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003703
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003704 if (PyUnicode_UTF8(unicode) == NULL) {
3705 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3707 if (bytes == NULL)
3708 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003709 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3710 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 Py_DECREF(bytes);
3712 return NULL;
3713 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003714 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3715 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3716 PyBytes_AS_STRING(bytes),
3717 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718 Py_DECREF(bytes);
3719 }
3720
3721 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003722 *psize = PyUnicode_UTF8_LENGTH(unicode);
3723 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003724}
3725
3726char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003727PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003728{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3730}
3731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003732Py_UNICODE *
3733PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 const unsigned char *one_byte;
3736#if SIZEOF_WCHAR_T == 4
3737 const Py_UCS2 *two_bytes;
3738#else
3739 const Py_UCS4 *four_bytes;
3740 const Py_UCS4 *ucs4_end;
3741 Py_ssize_t num_surrogates;
3742#endif
3743 wchar_t *w;
3744 wchar_t *wchar_end;
3745
3746 if (!PyUnicode_Check(unicode)) {
3747 PyErr_BadArgument();
3748 return NULL;
3749 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 assert(_PyUnicode_KIND(unicode) != 0);
3753 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003754
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003755 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3758 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003759 num_surrogates = 0;
3760
3761 for (; four_bytes < ucs4_end; ++four_bytes) {
3762 if (*four_bytes > 0xFFFF)
3763 ++num_surrogates;
3764 }
3765
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003766 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3767 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3768 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 PyErr_NoMemory();
3770 return NULL;
3771 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 w = _PyUnicode_WSTR(unicode);
3775 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3776 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003777 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3778 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003779 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003780 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003781 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3782 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783 }
3784 else
3785 *w = *four_bytes;
3786
3787 if (w > wchar_end) {
3788 assert(0 && "Miscalculated string end");
3789 }
3790 }
3791 *w = 0;
3792#else
3793 /* sizeof(wchar_t) == 4 */
3794 Py_FatalError("Impossible unicode object state, wstr and str "
3795 "should share memory already.");
3796 return NULL;
3797#endif
3798 }
3799 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003800 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3801 (_PyUnicode_LENGTH(unicode) + 1));
3802 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803 PyErr_NoMemory();
3804 return NULL;
3805 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003806 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3807 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3808 w = _PyUnicode_WSTR(unicode);
3809 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3812 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003813 for (; w < wchar_end; ++one_byte, ++w)
3814 *w = *one_byte;
3815 /* null-terminate the wstr */
3816 *w = 0;
3817 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003820 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821 for (; w < wchar_end; ++two_bytes, ++w)
3822 *w = *two_bytes;
3823 /* null-terminate the wstr */
3824 *w = 0;
3825#else
3826 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 PyObject_FREE(_PyUnicode_WSTR(unicode));
3828 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 Py_FatalError("Impossible unicode object state, wstr "
3830 "and str should share memory already.");
3831 return NULL;
3832#endif
3833 }
3834 else {
3835 assert(0 && "This should never happen.");
3836 }
3837 }
3838 }
3839 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 *size = PyUnicode_WSTR_LENGTH(unicode);
3841 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003842}
3843
Alexander Belopolsky40018472011-02-26 01:02:56 +00003844Py_UNICODE *
3845PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848}
3849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850
Alexander Belopolsky40018472011-02-26 01:02:56 +00003851Py_ssize_t
3852PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853{
3854 if (!PyUnicode_Check(unicode)) {
3855 PyErr_BadArgument();
3856 goto onError;
3857 }
3858 return PyUnicode_GET_SIZE(unicode);
3859
Benjamin Peterson29060642009-01-31 22:14:21 +00003860 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 return -1;
3862}
3863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864Py_ssize_t
3865PyUnicode_GetLength(PyObject *unicode)
3866{
Victor Stinner07621332012-06-16 04:53:46 +02003867 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868 PyErr_BadArgument();
3869 return -1;
3870 }
Victor Stinner07621332012-06-16 04:53:46 +02003871 if (PyUnicode_READY(unicode) == -1)
3872 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 return PyUnicode_GET_LENGTH(unicode);
3874}
3875
3876Py_UCS4
3877PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3878{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003879 void *data;
3880 int kind;
3881
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003882 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3883 PyErr_BadArgument();
3884 return (Py_UCS4)-1;
3885 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003886 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003887 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888 return (Py_UCS4)-1;
3889 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003890 data = PyUnicode_DATA(unicode);
3891 kind = PyUnicode_KIND(unicode);
3892 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893}
3894
3895int
3896PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3897{
3898 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003899 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900 return -1;
3901 }
Victor Stinner488fa492011-12-12 00:01:39 +01003902 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003903 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003904 PyErr_SetString(PyExc_IndexError, "string index out of range");
3905 return -1;
3906 }
Victor Stinner488fa492011-12-12 00:01:39 +01003907 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003908 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003909 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3910 PyErr_SetString(PyExc_ValueError, "character out of range");
3911 return -1;
3912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3914 index, ch);
3915 return 0;
3916}
3917
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3923
Victor Stinner554f3f02010-06-16 23:33:54 +00003924/* create or adjust a UnicodeDecodeError */
3925static void
3926make_decode_exception(PyObject **exceptionObject,
3927 const char *encoding,
3928 const char *input, Py_ssize_t length,
3929 Py_ssize_t startpos, Py_ssize_t endpos,
3930 const char *reason)
3931{
3932 if (*exceptionObject == NULL) {
3933 *exceptionObject = PyUnicodeDecodeError_Create(
3934 encoding, input, length, startpos, endpos, reason);
3935 }
3936 else {
3937 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3938 goto onError;
3939 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3940 goto onError;
3941 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3942 goto onError;
3943 }
3944 return;
3945
3946onError:
3947 Py_DECREF(*exceptionObject);
3948 *exceptionObject = NULL;
3949}
3950
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   *input / *inend are re-read from the exception object after the
   callback, *endinpos and *inptr are moved to the position returned by
   the callback, and *output is resized when the replacement plus the
   remaining input would not fit.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not go on to read a non-bytes object as bytes. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX / 2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4057
4058static int
4059unicode_decode_call_errorhandler_writer(
4060 const char *errors, PyObject **errorHandler,
4061 const char *encoding, const char *reason,
4062 const char **input, const char **inend, Py_ssize_t *startinpos,
4063 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4064 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4065{
4066 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4067
4068 PyObject *restuple = NULL;
4069 PyObject *repunicode = NULL;
4070 Py_ssize_t insize;
4071 Py_ssize_t newpos;
4072 PyObject *inputobj = NULL;
4073
4074 if (*errorHandler == NULL) {
4075 *errorHandler = PyCodec_LookupError(errors);
4076 if (*errorHandler == NULL)
4077 goto onError;
4078 }
4079
4080 make_decode_exception(exceptionObject,
4081 encoding,
4082 *input, *inend - *input,
4083 *startinpos, *endinpos,
4084 reason);
4085 if (*exceptionObject == NULL)
4086 goto onError;
4087
4088 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4089 if (restuple == NULL)
4090 goto onError;
4091 if (!PyTuple_Check(restuple)) {
4092 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4093 goto onError;
4094 }
4095 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004096 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004097
4098 /* Copy back the bytes variables, which might have been modified by the
4099 callback */
4100 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4101 if (!inputobj)
4102 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004103 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004104 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004105 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004106 *input = PyBytes_AS_STRING(inputobj);
4107 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004108 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004109 /* we can DECREF safely, as the exception has another reference,
4110 so the object won't go away. */
4111 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004115 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4117 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004118 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004120 writer->overallocate = 1;
4121 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4122 return
4123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004125 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004128 Py_XDECREF(restuple);
4129 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004133 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134}
4135
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004136/* --- UTF-7 Codec -------------------------------------------------------- */
4137
Antoine Pitrou244651a2009-05-04 18:56:13 +00004138/* See RFC2152 for details. We encode conservatively and decode liberally. */
4139
/* Helper macros for the base-64 alphabet used by UTF-7.  Each macro may
   evaluate its argument more than once, so pass expressions without side
   effects. */

/* True if c is one of the 64 base-64 characters: A-Z, a-z, 0-9, '+', '/'. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Base-64 value (0..63) of c; c must satisfy IS_BASE64(c).  Any character
   not matched by the first four cases maps to 63. */

#define FROM_BASE64(c)                                          \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                   \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :              \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :              \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
4170
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup indexed by the ASCII code (0..127).
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times; directO and directWS are parenthesized
 * so that compound expressions can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004216
Alexander Belopolsky40018472011-02-26 01:02:56 +00004217PyObject *
4218PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004219 Py_ssize_t size,
4220 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004221{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004222 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4223}
4224
Antoine Pitrou244651a2009-05-04 18:56:13 +00004225/* The decoder. The only state we preserve is our read position,
4226 * i.e. how many characters we have consumed. So if we end in the
4227 * middle of a shift sequence we have to back off the read position
4228 * and the output to the beginning of the sequence, otherwise we lose
4229 * all the shift state (seen bits, number of bits seen, high
4230 * surrogate). */
4231
Alexander Belopolsky40018472011-02-26 01:02:56 +00004232PyObject *
4233PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004234 Py_ssize_t size,
4235 const char *errors,
4236 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004237{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004239 Py_ssize_t startinpos;
4240 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004241 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004242 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004243 const char *errmsg = "";
4244 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004245 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004246 unsigned int base64bits = 0;
4247 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004248 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249 PyObject *errorHandler = NULL;
4250 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004251
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004252 if (size == 0) {
4253 if (consumed)
4254 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004255 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004256 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004257
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004258 /* Start off assuming it's all ASCII. Widen later as necessary. */
4259 _PyUnicodeWriter_Init(&writer, 0);
4260 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4261 goto onError;
4262
4263 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004264 e = s + size;
4265
4266 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004267 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004269 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004270
Antoine Pitrou244651a2009-05-04 18:56:13 +00004271 if (inShift) { /* in a base-64 section */
4272 if (IS_BASE64(ch)) { /* consume a base-64 character */
4273 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4274 base64bits += 6;
4275 s++;
4276 if (base64bits >= 16) {
4277 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004278 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004279 base64bits -= 16;
4280 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4281 if (surrogate) {
4282 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004283 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4284 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004285 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004286 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004287 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004288 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004289 }
4290 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004291 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004292 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004293 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004294 }
4295 }
Victor Stinner551ac952011-11-29 22:58:13 +01004296 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297 /* first surrogate */
4298 surrogate = outCh;
4299 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004300 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004301 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004302 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 }
4304 }
4305 }
4306 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307 inShift = 0;
4308 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004310 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004311 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004312 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 if (base64bits > 0) { /* left-over bits */
4315 if (base64bits >= 6) {
4316 /* We've seen at least one base-64 character */
4317 errmsg = "partial character in shift sequence";
4318 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004319 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004320 else {
4321 /* Some bits remain; they should be zero */
4322 if (base64buffer != 0) {
4323 errmsg = "non-zero padding bits in shift sequence";
4324 goto utf7Error;
4325 }
4326 }
4327 }
4328 if (ch != '-') {
4329 /* '-' is absorbed; other terminating
4330 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004331 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004332 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 }
4335 }
4336 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004337 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004338 s++; /* consume '+' */
4339 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004340 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004341 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004342 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 }
4344 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004348 }
4349 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004352 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004353 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004354 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 else {
4356 startinpos = s-starts;
4357 s++;
4358 errmsg = "unexpected special character";
4359 goto utf7Error;
4360 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004364 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 errors, &errorHandler,
4366 "utf7", errmsg,
4367 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004368 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370 }
4371
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 /* end of string */
4373
4374 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4375 /* if we're in an inconsistent state, that's an error */
4376 if (surrogate ||
4377 (base64bits >= 6) ||
4378 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004380 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 errors, &errorHandler,
4382 "utf7", "unterminated shift sequence",
4383 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004384 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 goto onError;
4386 if (s < e)
4387 goto restart;
4388 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390
4391 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004392 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004394 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004395 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 }
4397 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004398 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 Py_XDECREF(errorHandler);
4403 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004404 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 Py_XDECREF(errorHandler);
4408 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004409 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 return NULL;
4411}
4412
4413
Alexander Belopolsky40018472011-02-26 01:02:56 +00004414PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004415_PyUnicode_EncodeUTF7(PyObject *str,
4416 int base64SetO,
4417 int base64WhiteSpace,
4418 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004420 int kind;
4421 void *data;
4422 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004423 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004425 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 unsigned int base64bits = 0;
4427 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428 char * out;
4429 char * start;
4430
Benjamin Petersonbac79492012-01-14 13:34:47 -05004431 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004432 return NULL;
4433 kind = PyUnicode_KIND(str);
4434 data = PyUnicode_DATA(str);
4435 len = PyUnicode_GET_LENGTH(str);
4436
4437 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004440 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004441 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004442 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004443 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 if (v == NULL)
4445 return NULL;
4446
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004447 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004448 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004449 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 if (inShift) {
4452 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4453 /* shifting out */
4454 if (base64bits) { /* output remaining bits */
4455 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4456 base64buffer = 0;
4457 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458 }
4459 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 /* Characters not in the BASE64 set implicitly unshift the sequence
4461 so no '-' is required, except if the character is itself a '-' */
4462 if (IS_BASE64(ch) || ch == '-') {
4463 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 *out++ = (char) ch;
4466 }
4467 else {
4468 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004469 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 else { /* not in a shift sequence */
4472 if (ch == '+') {
4473 *out++ = '+';
4474 *out++ = '-';
4475 }
4476 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4477 *out++ = (char) ch;
4478 }
4479 else {
4480 *out++ = '+';
4481 inShift = 1;
4482 goto encode_char;
4483 }
4484 }
4485 continue;
4486encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004488 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004489
Antoine Pitrou244651a2009-05-04 18:56:13 +00004490 /* code first surrogate */
4491 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004492 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 while (base64bits >= 6) {
4494 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4495 base64bits -= 6;
4496 }
4497 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004498 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 base64bits += 16;
4501 base64buffer = (base64buffer << 16) | ch;
4502 while (base64bits >= 6) {
4503 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4504 base64bits -= 6;
4505 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 if (base64bits)
4508 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4509 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004511 if (_PyBytes_Resize(&v, out - start) < 0)
4512 return NULL;
4513 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004515PyObject *
4516PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4517 Py_ssize_t size,
4518 int base64SetO,
4519 int base64WhiteSpace,
4520 const char *errors)
4521{
4522 PyObject *result;
4523 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4524 if (tmp == NULL)
4525 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004526 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004527 base64WhiteSpace, errors);
4528 Py_DECREF(tmp);
4529 return result;
4530}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531
Antoine Pitrou244651a2009-05-04 18:56:13 +00004532#undef IS_BASE64
4533#undef FROM_BASE64
4534#undef TO_BASE64
4535#undef DECODE_DIRECT
4536#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538/* --- UTF-8 Codec -------------------------------------------------------- */
4539
Alexander Belopolsky40018472011-02-26 01:02:56 +00004540PyObject *
4541PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004542 Py_ssize_t size,
4543 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544{
Walter Dörwald69652032004-09-07 20:24:22 +00004545 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4546}
4547
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004548#include "stringlib/asciilib.h"
4549#include "stringlib/codecs.h"
4550#include "stringlib/undef.h"
4551
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004552#include "stringlib/ucs1lib.h"
4553#include "stringlib/codecs.h"
4554#include "stringlib/undef.h"
4555
4556#include "stringlib/ucs2lib.h"
4557#include "stringlib/codecs.h"
4558#include "stringlib/undef.h"
4559
4560#include "stringlib/ucs4lib.h"
4561#include "stringlib/codecs.h"
4562#include "stringlib/undef.h"
4563
Antoine Pitrouab868312009-01-10 15:40:25 +00004564/* Mask to quickly check whether a C 'long' contains a
4565 non-ASCII, UTF8-encoded char. */
4566#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004567# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004568#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004569# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004570#else
4571# error C 'long' size should be either 4 or 8!
4572#endif
4573
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004574static Py_ssize_t
4575ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004576{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004577 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004578 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004579
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004580#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004581 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4582 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004583 /* Fast path, see in STRINGLIB(utf8_decode) for
4584 an explanation. */
4585 /* Help register allocation */
4586 register const char *_p = p;
4587 register Py_UCS1 * q = dest;
4588 while (_p < aligned_end) {
4589 unsigned long value = *(const unsigned long *) _p;
4590 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004592 *((unsigned long *)q) = value;
4593 _p += SIZEOF_LONG;
4594 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004595 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004596 p = _p;
4597 while (p < end) {
4598 if ((unsigned char)*p & 0x80)
4599 break;
4600 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004602 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004604#endif
4605 while (p < end) {
4606 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4607 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004608 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004609 /* Help register allocation */
4610 register const char *_p = p;
4611 while (_p < aligned_end) {
4612 unsigned long value = *(unsigned long *) _p;
4613 if (value & ASCII_CHAR_MASK)
4614 break;
4615 _p += SIZEOF_LONG;
4616 }
4617 p = _p;
4618 if (_p == end)
4619 break;
4620 }
4621 if ((unsigned char)*p & 0x80)
4622 break;
4623 ++p;
4624 }
4625 memcpy(dest, start, p - start);
4626 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627}
Antoine Pitrouab868312009-01-10 15:40:25 +00004628
Victor Stinner785938e2011-12-11 20:09:03 +01004629PyObject *
4630PyUnicode_DecodeUTF8Stateful(const char *s,
4631 Py_ssize_t size,
4632 const char *errors,
4633 Py_ssize_t *consumed)
4634{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004635 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004636 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004637 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004638
4639 Py_ssize_t startinpos;
4640 Py_ssize_t endinpos;
4641 const char *errmsg = "";
4642 PyObject *errorHandler = NULL;
4643 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004644
4645 if (size == 0) {
4646 if (consumed)
4647 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004648 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004649 }
4650
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4652 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004653 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654 *consumed = 1;
4655 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004656 }
4657
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004658 _PyUnicodeWriter_Init(&writer, 0);
4659 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4660 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004661
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004662 writer.pos = ascii_decode(s, end, writer.data);
4663 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664 while (s < end) {
4665 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004666 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004668 if (PyUnicode_IS_ASCII(writer.buffer))
4669 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004671 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004672 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 } else {
4675 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004676 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 }
4678
4679 switch (ch) {
4680 case 0:
4681 if (s == end || consumed)
4682 goto End;
4683 errmsg = "unexpected end of data";
4684 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004685 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686 break;
4687 case 1:
4688 errmsg = "invalid start byte";
4689 startinpos = s - starts;
4690 endinpos = startinpos + 1;
4691 break;
4692 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004693 case 3:
4694 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695 errmsg = "invalid continuation byte";
4696 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004697 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004698 break;
4699 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004700 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004701 goto onError;
4702 continue;
4703 }
4704
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004705 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004706 errors, &errorHandler,
4707 "utf-8", errmsg,
4708 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004709 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004710 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004711 }
4712
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004713End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004714 if (consumed)
4715 *consumed = s - starts;
4716
4717 Py_XDECREF(errorHandler);
4718 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004719 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720
4721onError:
4722 Py_XDECREF(errorHandler);
4723 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004724 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004725 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004726}
4727
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004728#ifdef __APPLE__
4729
4730/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004731 used to decode the command line arguments on Mac OS X.
4732
4733 Return a pointer to a newly allocated wide character string (use
4734 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004735
4736wchar_t*
4737_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4738{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004739 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004740 wchar_t *unicode;
4741 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004742
4743 /* Note: size will always be longer than the resulting Unicode
4744 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004745 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004746 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004747 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4748 if (!unicode)
4749 return NULL;
4750
4751 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004752 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004754 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004755 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004756#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004758#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004759 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004760#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 if (ch > 0xFF) {
4762#if SIZEOF_WCHAR_T == 4
4763 assert(0);
4764#else
4765 assert(Py_UNICODE_IS_SURROGATE(ch));
4766 /* compute and append the two surrogates: */
4767 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4768 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4769#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004770 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 else {
4772 if (!ch && s == e)
4773 break;
4774 /* surrogateescape */
4775 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4776 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004777 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004779 return unicode;
4780}
4781
4782#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004784/* Primary internal function which creates utf8 encoded bytes objects.
4785
4786 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004787 and allocate exactly as much space needed at the end. Else allocate the
4788 maximum possible needed (4 result bytes per Unicode character), and return
4789 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004790*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004791PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004792_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793{
Victor Stinner6099a032011-12-18 14:22:26 +01004794 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004795 void *data;
4796 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004798 if (!PyUnicode_Check(unicode)) {
4799 PyErr_BadArgument();
4800 return NULL;
4801 }
4802
4803 if (PyUnicode_READY(unicode) == -1)
4804 return NULL;
4805
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004806 if (PyUnicode_UTF8(unicode))
4807 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4808 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004809
4810 kind = PyUnicode_KIND(unicode);
4811 data = PyUnicode_DATA(unicode);
4812 size = PyUnicode_GET_LENGTH(unicode);
4813
Benjamin Petersonead6b532011-12-20 17:23:42 -06004814 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004815 default:
4816 assert(0);
4817 case PyUnicode_1BYTE_KIND:
4818 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4819 assert(!PyUnicode_IS_ASCII(unicode));
4820 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4821 case PyUnicode_2BYTE_KIND:
4822 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4823 case PyUnicode_4BYTE_KIND:
4824 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826}
4827
Alexander Belopolsky40018472011-02-26 01:02:56 +00004828PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4830 Py_ssize_t size,
4831 const char *errors)
4832{
4833 PyObject *v, *unicode;
4834
4835 unicode = PyUnicode_FromUnicode(s, size);
4836 if (unicode == NULL)
4837 return NULL;
4838 v = _PyUnicode_AsUTF8String(unicode, errors);
4839 Py_DECREF(unicode);
4840 return v;
4841}
4842
4843PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004844PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847}
4848
Walter Dörwald41980ca2007-08-16 21:55:45 +00004849/* --- UTF-32 Codec ------------------------------------------------------- */
4850
4851PyObject *
4852PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 Py_ssize_t size,
4854 const char *errors,
4855 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004856{
4857 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4858}
4859
4860PyObject *
4861PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 Py_ssize_t size,
4863 const char *errors,
4864 int *byteorder,
4865 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004866{
4867 const char *starts = s;
4868 Py_ssize_t startinpos;
4869 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004870 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004871 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004872 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004873 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004874 PyObject *errorHandler = NULL;
4875 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004876
Walter Dörwald41980ca2007-08-16 21:55:45 +00004877 q = (unsigned char *)s;
4878 e = q + size;
4879
4880 if (byteorder)
4881 bo = *byteorder;
4882
4883 /* Check for BOM marks (U+FEFF) in the input and adjust current
4884 byte order setting accordingly. In native mode, the leading BOM
4885 mark is skipped, in all other modes, it is copied to the output
4886 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004887 if (bo == 0 && size >= 4) {
4888 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4889 if (bom == 0x0000FEFF) {
4890 bo = -1;
4891 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004893 else if (bom == 0xFFFE0000) {
4894 bo = 1;
4895 q += 4;
4896 }
4897 if (byteorder)
4898 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004899 }
4900
Victor Stinnere64322e2012-10-30 23:12:47 +01004901 if (q == e) {
4902 if (consumed)
4903 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004904 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004905 }
4906
Victor Stinnere64322e2012-10-30 23:12:47 +01004907#ifdef WORDS_BIGENDIAN
4908 le = bo < 0;
4909#else
4910 le = bo <= 0;
4911#endif
4912
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004913 _PyUnicodeWriter_Init(&writer, 0);
4914 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4915 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004916
Victor Stinnere64322e2012-10-30 23:12:47 +01004917 while (1) {
4918 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004919 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004920
Victor Stinnere64322e2012-10-30 23:12:47 +01004921 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004922 enum PyUnicode_Kind kind = writer.kind;
4923 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004924 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004925 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004926 if (le) {
4927 do {
4928 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4929 if (ch > maxch)
4930 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004931 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004932 q += 4;
4933 } while (q <= last);
4934 }
4935 else {
4936 do {
4937 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4938 if (ch > maxch)
4939 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004940 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004941 q += 4;
4942 } while (q <= last);
4943 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004944 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004945 }
4946
4947 if (ch <= maxch) {
4948 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004950 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004952 startinpos = ((const char *)q) - starts;
4953 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004955 else {
4956 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004957 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01004958 goto onError;
4959 q += 4;
4960 continue;
4961 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004963 startinpos = ((const char *)q) - starts;
4964 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004966
4967 /* The remaining input chars are ignored if the callback
4968 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 errors, &errorHandler,
4971 "utf32", errmsg,
4972 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004973 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004975 }
4976
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980 Py_XDECREF(errorHandler);
4981 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986 Py_XDECREF(errorHandler);
4987 Py_XDECREF(exc);
4988 return NULL;
4989}
4990
4991PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004992_PyUnicode_EncodeUTF32(PyObject *str,
4993 const char *errors,
4994 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004995{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004996 int kind;
4997 void *data;
4998 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004999 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005001 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005003#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004 int iorder[] = {0, 1, 2, 3};
5005#else
5006 int iorder[] = {3, 2, 1, 0};
5007#endif
5008
Benjamin Peterson29060642009-01-31 22:14:21 +00005009#define STORECHAR(CH) \
5010 do { \
5011 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5012 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5013 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5014 p[iorder[0]] = (CH) & 0xff; \
5015 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005016 } while(0)
5017
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005018 if (!PyUnicode_Check(str)) {
5019 PyErr_BadArgument();
5020 return NULL;
5021 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005022 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005023 return NULL;
5024 kind = PyUnicode_KIND(str);
5025 data = PyUnicode_DATA(str);
5026 len = PyUnicode_GET_LENGTH(str);
5027
5028 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005029 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005031 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005032 if (v == NULL)
5033 return NULL;
5034
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005035 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005036 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005038 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005039 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005040
5041 if (byteorder == -1) {
5042 /* force LE */
5043 iorder[0] = 0;
5044 iorder[1] = 1;
5045 iorder[2] = 2;
5046 iorder[3] = 3;
5047 }
5048 else if (byteorder == 1) {
5049 /* force BE */
5050 iorder[0] = 3;
5051 iorder[1] = 2;
5052 iorder[2] = 1;
5053 iorder[3] = 0;
5054 }
5055
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005056 for (i = 0; i < len; i++)
5057 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005058
5059 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005060 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061#undef STORECHAR
5062}
5063
Alexander Belopolsky40018472011-02-26 01:02:56 +00005064PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005065PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5066 Py_ssize_t size,
5067 const char *errors,
5068 int byteorder)
5069{
5070 PyObject *result;
5071 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5072 if (tmp == NULL)
5073 return NULL;
5074 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5075 Py_DECREF(tmp);
5076 return result;
5077}
5078
5079PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005080PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081{
Victor Stinnerb960b342011-11-20 19:12:52 +01005082 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083}
5084
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085/* --- UTF-16 Codec ------------------------------------------------------- */
5086
Tim Peters772747b2001-08-09 22:21:55 +00005087PyObject *
5088PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 Py_ssize_t size,
5090 const char *errors,
5091 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092{
Walter Dörwald69652032004-09-07 20:24:22 +00005093 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5094}
5095
5096PyObject *
5097PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 Py_ssize_t size,
5099 const char *errors,
5100 int *byteorder,
5101 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005102{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005104 Py_ssize_t startinpos;
5105 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005106 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005107 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005108 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005109 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005110 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005111 PyObject *errorHandler = NULL;
5112 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113
Tim Peters772747b2001-08-09 22:21:55 +00005114 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005115 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116
5117 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005118 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005120 /* Check for BOM marks (U+FEFF) in the input and adjust current
5121 byte order setting accordingly. In native mode, the leading BOM
5122 mark is skipped, in all other modes, it is copied to the output
5123 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005124 if (bo == 0 && size >= 2) {
5125 const Py_UCS4 bom = (q[1] << 8) | q[0];
5126 if (bom == 0xFEFF) {
5127 q += 2;
5128 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005130 else if (bom == 0xFFFE) {
5131 q += 2;
5132 bo = 1;
5133 }
5134 if (byteorder)
5135 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137
Antoine Pitrou63065d72012-05-15 23:48:04 +02005138 if (q == e) {
5139 if (consumed)
5140 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005141 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005142 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005143
Christian Heimes743e0cd2012-10-17 23:52:17 +02005144#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005145 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005146#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005147 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005148#endif
Tim Peters772747b2001-08-09 22:21:55 +00005149
Antoine Pitrou63065d72012-05-15 23:48:04 +02005150 /* Note: size will always be longer than the resulting Unicode
5151 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005152 _PyUnicodeWriter_Init(&writer, 0);
5153 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5154 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005155
Antoine Pitrou63065d72012-05-15 23:48:04 +02005156 while (1) {
5157 Py_UCS4 ch = 0;
5158 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005159 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005160 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005161 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005162 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005163 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005164 native_ordering);
5165 else
5166 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005167 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005168 native_ordering);
5169 } else if (kind == PyUnicode_2BYTE_KIND) {
5170 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005171 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005172 native_ordering);
5173 } else {
5174 assert(kind == PyUnicode_4BYTE_KIND);
5175 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005176 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005177 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005178 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005179 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180
Antoine Pitrou63065d72012-05-15 23:48:04 +02005181 switch (ch)
5182 {
5183 case 0:
5184 /* remaining byte at the end? (size should be even) */
5185 if (q == e || consumed)
5186 goto End;
5187 errmsg = "truncated data";
5188 startinpos = ((const char *)q) - starts;
5189 endinpos = ((const char *)e) - starts;
5190 break;
5191 /* The remaining input chars are ignored if the callback
5192 chooses to skip the input */
5193 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005194 q -= 2;
5195 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005196 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005197 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005198 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005199 endinpos = ((const char *)e) - starts;
5200 break;
5201 case 2:
5202 errmsg = "illegal encoding";
5203 startinpos = ((const char *)q) - 2 - starts;
5204 endinpos = startinpos + 2;
5205 break;
5206 case 3:
5207 errmsg = "illegal UTF-16 surrogate";
5208 startinpos = ((const char *)q) - 4 - starts;
5209 endinpos = startinpos + 2;
5210 break;
5211 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005212 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005213 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 continue;
5215 }
5216
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005217 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005218 errors,
5219 &errorHandler,
5220 "utf16", errmsg,
5221 &starts,
5222 (const char **)&e,
5223 &startinpos,
5224 &endinpos,
5225 &exc,
5226 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005227 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 }
5230
Antoine Pitrou63065d72012-05-15 23:48:04 +02005231End:
Walter Dörwald69652032004-09-07 20:24:22 +00005232 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005234
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 Py_XDECREF(errorHandler);
5236 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005237 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005240 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 Py_XDECREF(errorHandler);
5242 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 return NULL;
5244}
5245
Tim Peters772747b2001-08-09 22:21:55 +00005246PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005247_PyUnicode_EncodeUTF16(PyObject *str,
5248 const char *errors,
5249 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005251 enum PyUnicode_Kind kind;
5252 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005253 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005254 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005255 unsigned short *out;
5256 Py_ssize_t bytesize;
5257 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005258#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005259 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005260#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005261 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005262#endif
5263
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005264 if (!PyUnicode_Check(str)) {
5265 PyErr_BadArgument();
5266 return NULL;
5267 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005268 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269 return NULL;
5270 kind = PyUnicode_KIND(str);
5271 data = PyUnicode_DATA(str);
5272 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005273
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005274 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005275 if (kind == PyUnicode_4BYTE_KIND) {
5276 const Py_UCS4 *in = (const Py_UCS4 *)data;
5277 const Py_UCS4 *end = in + len;
5278 while (in < end)
5279 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005280 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005281 }
5282 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005284 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005285 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 if (v == NULL)
5287 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005289 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005290 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005291 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005293 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005294 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005295 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005296
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005297 switch (kind) {
5298 case PyUnicode_1BYTE_KIND: {
5299 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5300 break;
Tim Peters772747b2001-08-09 22:21:55 +00005301 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005302 case PyUnicode_2BYTE_KIND: {
5303 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5304 break;
Tim Peters772747b2001-08-09 22:21:55 +00005305 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005306 case PyUnicode_4BYTE_KIND: {
5307 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5308 break;
5309 }
5310 default:
5311 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005312 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005313
5314 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005315 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316}
5317
Alexander Belopolsky40018472011-02-26 01:02:56 +00005318PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005319PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5320 Py_ssize_t size,
5321 const char *errors,
5322 int byteorder)
5323{
5324 PyObject *result;
5325 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5326 if (tmp == NULL)
5327 return NULL;
5328 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5329 Py_DECREF(tmp);
5330 return result;
5331}
5332
5333PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005334PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005336 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337}
5338
5339/* --- Unicode Escape Codec ----------------------------------------------- */
5340
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether the
   escaped input decodes to a pure ASCII string and, if so, how long that
   string is.

   Returns the number of characters the decoded string will have, or -1 if
   that cannot be guaranteed to be an exact ASCII length, namely when:
     - size is negative,
     - the input contains a byte above 127,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present (these may produce
       non-ASCII characters).

   Backslash-newline contributes nothing to the length; an unrecognised
   escape is counted as two characters (the backslash and the character
   following it).
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before using it to form the end pointer: adding a
       negative offset to s would be out-of-bounds pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5395
Fredrik Lundh06d12682001-01-24 07:59:11 +00005396static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005397
Alexander Belopolsky40018472011-02-26 01:02:56 +00005398PyObject *
5399PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005400 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005401 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005404 Py_ssize_t startinpos;
5405 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005406 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005408 char* message;
5409 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410 PyObject *errorHandler = NULL;
5411 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005412 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005413
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005414 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005415 if (len == 0)
5416 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005417
5418 /* After length_of_escaped_ascii_string() there are two alternatives,
5419 either the string is pure ASCII with named escapes like \n, etc.
5420 and we determined it's exact size (common case)
5421 or it contains \x, \u, ... escape sequences. then we create a
5422 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005423 _PyUnicodeWriter_Init(&writer, 0);
5424 if (len > 0) {
5425 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005427 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005428 }
5429 else {
5430 /* Escaped strings will always be longer than the resulting
5431 Unicode string, so we start with size here and then reduce the
5432 length after conversion to the true value.
5433 (but if the error callback returns a long replacement string
5434 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005435 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005437 }
5438
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005440 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 while (s < end) {
5444 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005445 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447
5448 /* Non-escape characters are interpreted as Unicode ordinals */
5449 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005450 x = (unsigned char)*s;
5451 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005452 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005453 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 continue;
5455 }
5456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 /* \ - Escapes */
5459 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005460 c = *s++;
5461 if (s > end)
5462 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005463
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005464 /* The only case in which i == ascii_length is a backslash
5465 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005466 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005467
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005468 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005471#define WRITECHAR(ch) \
5472 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005473 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005474 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005475 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005476
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005478 case '\\': WRITECHAR('\\'); break;
5479 case '\'': WRITECHAR('\''); break;
5480 case '\"': WRITECHAR('\"'); break;
5481 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005483 case 'f': WRITECHAR('\014'); break;
5484 case 't': WRITECHAR('\t'); break;
5485 case 'n': WRITECHAR('\n'); break;
5486 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005487 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005488 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005489 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005490 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 case '0': case '1': case '2': case '3':
5494 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005495 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005496 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005497 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005498 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005499 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005501 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 break;
5503
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 /* hex escapes */
5505 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005507 digits = 2;
5508 message = "truncated \\xXX escape";
5509 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005513 digits = 4;
5514 message = "truncated \\uXXXX escape";
5515 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005518 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005519 digits = 8;
5520 message = "truncated \\UXXXXXXXX escape";
5521 hexescape:
5522 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005523 if (end - s < digits) {
5524 /* count only hex digits */
5525 for (; s < end; ++s) {
5526 c = (unsigned char)*s;
5527 if (!Py_ISXDIGIT(c))
5528 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005529 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005530 goto error;
5531 }
5532 for (; digits--; ++s) {
5533 c = (unsigned char)*s;
5534 if (!Py_ISXDIGIT(c))
5535 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005536 chr = (chr<<4) & ~0xF;
5537 if (c >= '0' && c <= '9')
5538 chr += c - '0';
5539 else if (c >= 'a' && c <= 'f')
5540 chr += 10 + c - 'a';
5541 else
5542 chr += 10 + c - 'A';
5543 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005544 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005545 /* _decoding_error will have already written into the
5546 target buffer. */
5547 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005548 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005549 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005550 message = "illegal Unicode character";
5551 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005552 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005553 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 break;
5555
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005557 case 'N':
5558 message = "malformed \\N character escape";
5559 if (ucnhash_CAPI == NULL) {
5560 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005561 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5562 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005563 if (ucnhash_CAPI == NULL)
5564 goto ucnhashError;
5565 }
5566 if (*s == '{') {
5567 const char *start = s+1;
5568 /* look for the closing brace */
5569 while (*s != '}' && s < end)
5570 s++;
5571 if (s > start && s < end && *s == '}') {
5572 /* found a name. look it up in the unicode database */
5573 message = "unknown Unicode character name";
5574 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005575 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005576 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005577 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005578 goto store;
5579 }
5580 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005581 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005582
5583 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005584 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585 message = "\\ at end of string";
5586 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005587 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005588 }
5589 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005590 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005591 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005592 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005593 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005595 continue;
5596
5597 error:
5598 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005599 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005600 errors, &errorHandler,
5601 "unicodeescape", message,
5602 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005603 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005604 goto onError;
5605 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005607#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005609 Py_XDECREF(errorHandler);
5610 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005611 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005612
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005614 PyErr_SetString(
5615 PyExc_UnicodeError,
5616 "\\N escapes not supported (can't load unicodedata module)"
5617 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 Py_XDECREF(errorHandler);
5620 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005621 return NULL;
5622
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 Py_XDECREF(errorHandler);
5626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 return NULL;
5628}
5629
/* Return a Unicode-Escape string version of the Unicode object.

   Note: the function takes only the Unicode object; it has no `quotes`
   argument.

*/

5636
Alexander Belopolsky40018472011-02-26 01:02:56 +00005637PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005638PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005640 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005641 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 int kind;
5644 void *data;
5645 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646
Ezio Melottie7f90372012-10-05 03:33:31 +03005647 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005648 escape.
5649
Ezio Melottie7f90372012-10-05 03:33:31 +03005650 For UCS1 strings it's '\xxx', 4 bytes per source character.
5651 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5652 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005653 */
5654
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005655 if (!PyUnicode_Check(unicode)) {
5656 PyErr_BadArgument();
5657 return NULL;
5658 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005659 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660 return NULL;
5661 len = PyUnicode_GET_LENGTH(unicode);
5662 kind = PyUnicode_KIND(unicode);
5663 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005664 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005665 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5666 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5667 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5668 }
5669
5670 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005671 return PyBytes_FromStringAndSize(NULL, 0);
5672
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005673 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005675
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005676 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005678 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 if (repr == NULL)
5681 return NULL;
5682
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005683 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005685 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005686 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005687
Walter Dörwald79e913e2007-05-12 11:08:06 +00005688 /* Escape backslashes */
5689 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 *p++ = '\\';
5691 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005692 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005693 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005694
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005695 /* Map 21-bit characters to '\U00xxxxxx' */
5696 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005697 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005698 *p++ = '\\';
5699 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005700 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5701 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5702 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5703 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5704 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5705 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5706 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5707 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005709 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005710
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005712 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 *p++ = '\\';
5714 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005715 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5716 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5717 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5718 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005720
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005721 /* Map special whitespace to '\t', \n', '\r' */
5722 else if (ch == '\t') {
5723 *p++ = '\\';
5724 *p++ = 't';
5725 }
5726 else if (ch == '\n') {
5727 *p++ = '\\';
5728 *p++ = 'n';
5729 }
5730 else if (ch == '\r') {
5731 *p++ = '\\';
5732 *p++ = 'r';
5733 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005734
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005735 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005736 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005738 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005739 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5740 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005741 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005742
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 /* Copy everything else as-is */
5744 else
5745 *p++ = (char) ch;
5746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005748 assert(p - PyBytes_AS_STRING(repr) > 0);
5749 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5750 return NULL;
5751 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752}
5753
Alexander Belopolsky40018472011-02-26 01:02:56 +00005754PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5756 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005758 PyObject *result;
5759 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5760 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005762 result = PyUnicode_AsUnicodeEscapeString(tmp);
5763 Py_DECREF(tmp);
5764 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765}
5766
5767/* --- Raw Unicode Escape Codec ------------------------------------------- */
5768
Alexander Belopolsky40018472011-02-26 01:02:56 +00005769PyObject *
5770PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005771 Py_ssize_t size,
5772 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005775 Py_ssize_t startinpos;
5776 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 const char *end;
5779 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 PyObject *errorHandler = NULL;
5781 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005782
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005783 if (size == 0)
5784 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 /* Escaped strings will always be longer than the resulting
5787 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 length after conversion to the true value. (But decoding error
5789 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005790 _PyUnicodeWriter_Init(&writer, 1);
5791 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005793
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 end = s + size;
5795 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 unsigned char c;
5797 Py_UCS4 x;
5798 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005799 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 /* Non-escape characters are interpreted as Unicode ordinals */
5802 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005803 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005804 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005805 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005807 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 startinpos = s-starts;
5809
5810 /* \u-escapes are only interpreted iff the number of leading
5811 backslashes if odd */
5812 bs = s;
5813 for (;s < end;) {
5814 if (*s != '\\')
5815 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005816 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005817 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005818 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 }
5820 if (((s - bs) & 1) == 0 ||
5821 s >= end ||
5822 (*s != 'u' && *s != 'U')) {
5823 continue;
5824 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005825 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 count = *s=='u' ? 4 : 8;
5827 s++;
5828
5829 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 for (x = 0, i = 0; i < count; ++i, ++s) {
5831 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005832 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005834 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 errors, &errorHandler,
5836 "rawunicodeescape", "truncated \\uXXXX",
5837 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005838 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 goto onError;
5840 goto nextByte;
5841 }
5842 x = (x<<4) & ~0xF;
5843 if (c >= '0' && c <= '9')
5844 x += c - '0';
5845 else if (c >= 'a' && c <= 'f')
5846 x += 10 + c - 'a';
5847 else
5848 x += 10 + c - 'A';
5849 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005850 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005851 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005852 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 }
5854 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005855 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005856 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005857 errors, &errorHandler,
5858 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005860 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005862 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 nextByte:
5864 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 Py_XDECREF(errorHandler);
5867 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005868 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005871 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872 Py_XDECREF(errorHandler);
5873 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 return NULL;
5875}
5876
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005877
Alexander Belopolsky40018472011-02-26 01:02:56 +00005878PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005881 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 char *p;
5883 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005884 Py_ssize_t expandsize, pos;
5885 int kind;
5886 void *data;
5887 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005889 if (!PyUnicode_Check(unicode)) {
5890 PyErr_BadArgument();
5891 return NULL;
5892 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005893 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 return NULL;
5895 kind = PyUnicode_KIND(unicode);
5896 data = PyUnicode_DATA(unicode);
5897 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005898 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5899 bytes, and 1 byte characters 4. */
5900 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005901
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005904
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005905 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 if (repr == NULL)
5907 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005909 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005911 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005912 for (pos = 0; pos < len; pos++) {
5913 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 /* Map 32-bit characters to '\Uxxxxxxxx' */
5915 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005916 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005917 *p++ = '\\';
5918 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005919 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5920 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5921 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5922 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5923 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5924 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5925 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5926 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005927 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005929 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 *p++ = '\\';
5931 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005932 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5933 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5934 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5935 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 /* Copy everything else as-is */
5938 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 *p++ = (char) ch;
5940 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005941
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005942 assert(p > q);
5943 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 return NULL;
5945 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946}
5947
Alexander Belopolsky40018472011-02-26 01:02:56 +00005948PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5950 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952 PyObject *result;
5953 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5954 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005955 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5957 Py_DECREF(tmp);
5958 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959}
5960
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005961/* --- Unicode Internal Codec ------------------------------------------- */
5962
Alexander Belopolsky40018472011-02-26 01:02:56 +00005963PyObject *
5964_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005965 Py_ssize_t size,
5966 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005967{
5968 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005969 Py_ssize_t startinpos;
5970 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005971 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005972 const char *end;
5973 const char *reason;
5974 PyObject *errorHandler = NULL;
5975 PyObject *exc = NULL;
5976
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005977 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005978 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005979 1))
5980 return NULL;
5981
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005982 if (size == 0)
5983 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005984
Thomas Wouters89f507f2006-12-13 04:49:30 +00005985 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005986 _PyUnicodeWriter_Init(&writer, 0);
5987 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005989 end = s + size;
5990
5991 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005992 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005993 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02005994 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02005995 endinpos = end-starts;
5996 reason = "truncated input";
5997 goto error;
5998 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005999 /* We copy the raw representation one byte at a time because the
6000 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006001 ((char *) &uch)[0] = s[0];
6002 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006003#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006004 ((char *) &uch)[2] = s[2];
6005 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006006#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006007 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006008#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006009 /* We have to sanity check the raw data, otherwise doom looms for
6010 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006011 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006012 endinpos = s - starts + Py_UNICODE_SIZE;
6013 reason = "illegal code point (> 0x10FFFF)";
6014 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006015 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006016#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006017 s += Py_UNICODE_SIZE;
6018#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006019 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006020 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006021 Py_UNICODE uch2;
6022 ((char *) &uch2)[0] = s[0];
6023 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006024 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006025 {
Victor Stinner551ac952011-11-29 22:58:13 +01006026 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006027 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006028 }
6029 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006030#endif
6031
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006032 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006033 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006034 continue;
6035
6036 error:
6037 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006038 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006039 errors, &errorHandler,
6040 "unicode_internal", reason,
6041 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006042 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006043 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006044 }
6045
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006046 Py_XDECREF(errorHandler);
6047 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006048 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006049
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006051 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006052 Py_XDECREF(errorHandler);
6053 Py_XDECREF(exc);
6054 return NULL;
6055}
6056
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057/* --- Latin-1 Codec ------------------------------------------------------ */
6058
Alexander Belopolsky40018472011-02-26 01:02:56 +00006059PyObject *
6060PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006061 Py_ssize_t size,
6062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006065 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066}
6067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006069static void
6070make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006071 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006072 PyObject *unicode,
6073 Py_ssize_t startpos, Py_ssize_t endpos,
6074 const char *reason)
6075{
6076 if (*exceptionObject == NULL) {
6077 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006078 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006079 encoding, unicode, startpos, endpos, reason);
6080 }
6081 else {
6082 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6083 goto onError;
6084 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6085 goto onError;
6086 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6087 goto onError;
6088 return;
6089 onError:
6090 Py_DECREF(*exceptionObject);
6091 *exceptionObject = NULL;
6092 }
6093}
6094
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006096static void
6097raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006098 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006099 PyObject *unicode,
6100 Py_ssize_t startpos, Py_ssize_t endpos,
6101 const char *reason)
6102{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006103 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006104 encoding, unicode, startpos, endpos, reason);
6105 if (*exceptionObject != NULL)
6106 PyCodec_StrictErrors(*exceptionObject);
6107}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108
6109/* error handling callback helper:
6110 build arguments, call the callback and check the arguments,
6111 put the result into newpos and return the replacement string, which
6112 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006113static PyObject *
6114unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006115 PyObject **errorHandler,
6116 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006117 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006118 Py_ssize_t startpos, Py_ssize_t endpos,
6119 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006121 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006122 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 PyObject *restuple;
6124 PyObject *resunicode;
6125
6126 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006130 }
6131
Benjamin Petersonbac79492012-01-14 13:34:47 -05006132 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006133 return NULL;
6134 len = PyUnicode_GET_LENGTH(unicode);
6135
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006136 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006137 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140
6141 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006146 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 Py_DECREF(restuple);
6148 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006150 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 &resunicode, newpos)) {
6152 Py_DECREF(restuple);
6153 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006155 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6156 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6157 Py_DECREF(restuple);
6158 return NULL;
6159 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006161 *newpos = len + *newpos;
6162 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6164 Py_DECREF(restuple);
6165 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006166 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167 Py_INCREF(resunicode);
6168 Py_DECREF(restuple);
6169 return resunicode;
6170}
6171
Alexander Belopolsky40018472011-02-26 01:02:56 +00006172static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006173unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006174 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006175 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006177 /* input state */
6178 Py_ssize_t pos=0, size;
6179 int kind;
6180 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 /* output object */
6182 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183 /* pointer into the output */
6184 char *str;
6185 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006186 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006187 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6188 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006189 PyObject *errorHandler = NULL;
6190 PyObject *exc = NULL;
6191 /* the following variable is used for caching string comparisons
6192 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6193 int known_errorHandler = -1;
6194
Benjamin Petersonbac79492012-01-14 13:34:47 -05006195 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006196 return NULL;
6197 size = PyUnicode_GET_LENGTH(unicode);
6198 kind = PyUnicode_KIND(unicode);
6199 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006200 /* allocate enough for a simple encoding without
6201 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006202 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006203 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006204 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006205 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006206 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006207 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 ressize = size;
6209
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006210 while (pos < size) {
6211 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006212
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 /* can we encode this? */
6214 if (c<limit) {
6215 /* no overflow check, because we know that the space is enough */
6216 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006217 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006218 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 Py_ssize_t requiredsize;
6221 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006222 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006224 Py_ssize_t collstart = pos;
6225 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006227 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 ++collend;
6229 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6230 if (known_errorHandler==-1) {
6231 if ((errors==NULL) || (!strcmp(errors, "strict")))
6232 known_errorHandler = 1;
6233 else if (!strcmp(errors, "replace"))
6234 known_errorHandler = 2;
6235 else if (!strcmp(errors, "ignore"))
6236 known_errorHandler = 3;
6237 else if (!strcmp(errors, "xmlcharrefreplace"))
6238 known_errorHandler = 4;
6239 else
6240 known_errorHandler = 0;
6241 }
6242 switch (known_errorHandler) {
6243 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006244 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 goto onError;
6246 case 2: /* replace */
6247 while (collstart++<collend)
6248 *str++ = '?'; /* fall through */
6249 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006250 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 break;
6252 case 4: /* xmlcharrefreplace */
6253 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006254 /* determine replacement size */
6255 for (i = collstart, repsize = 0; i < collend; ++i) {
6256 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6257 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006259 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006261 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006263 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006265 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006267 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006269 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006270 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006272 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006274 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 if (requiredsize > ressize) {
6276 if (requiredsize<2*ressize)
6277 requiredsize = 2*ressize;
6278 if (_PyBytes_Resize(&res, requiredsize))
6279 goto onError;
6280 str = PyBytes_AS_STRING(res) + respos;
6281 ressize = requiredsize;
6282 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006283 /* generate replacement */
6284 for (i = collstart; i < collend; ++i) {
6285 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006287 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 break;
6289 default:
6290 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006291 encoding, reason, unicode, &exc,
6292 collstart, collend, &newpos);
6293 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006294 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006296 if (PyBytes_Check(repunicode)) {
6297 /* Directly copy bytes result to output. */
6298 repsize = PyBytes_Size(repunicode);
6299 if (repsize > 1) {
6300 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006301 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006302 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6303 Py_DECREF(repunicode);
6304 goto onError;
6305 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006306 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006307 ressize += repsize-1;
6308 }
6309 memcpy(str, PyBytes_AsString(repunicode), repsize);
6310 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006311 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006312 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006313 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006314 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 /* need more space? (at least enough for what we
6316 have+the replacement+the rest of the string, so
6317 we won't have to check space for encodable characters) */
6318 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006319 repsize = PyUnicode_GET_LENGTH(repunicode);
6320 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 if (requiredsize > ressize) {
6322 if (requiredsize<2*ressize)
6323 requiredsize = 2*ressize;
6324 if (_PyBytes_Resize(&res, requiredsize)) {
6325 Py_DECREF(repunicode);
6326 goto onError;
6327 }
6328 str = PyBytes_AS_STRING(res) + respos;
6329 ressize = requiredsize;
6330 }
6331 /* check if there is anything unencodable in the replacement
6332 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006333 for (i = 0; repsize-->0; ++i, ++str) {
6334 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006336 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006337 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 Py_DECREF(repunicode);
6339 goto onError;
6340 }
6341 *str = (char)c;
6342 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006343 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006344 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006346 }
6347 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006348 /* Resize if we allocated to much */
6349 size = str - PyBytes_AS_STRING(res);
6350 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006351 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006352 if (_PyBytes_Resize(&res, size) < 0)
6353 goto onError;
6354 }
6355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 Py_XDECREF(errorHandler);
6357 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006358 return res;
6359
6360 onError:
6361 Py_XDECREF(res);
6362 Py_XDECREF(errorHandler);
6363 Py_XDECREF(exc);
6364 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365}
6366
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006367/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006368PyObject *
6369PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006370 Py_ssize_t size,
6371 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006373 PyObject *result;
6374 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6375 if (unicode == NULL)
6376 return NULL;
6377 result = unicode_encode_ucs1(unicode, errors, 256);
6378 Py_DECREF(unicode);
6379 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380}
6381
Alexander Belopolsky40018472011-02-26 01:02:56 +00006382PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006383_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384{
6385 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 PyErr_BadArgument();
6387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006389 if (PyUnicode_READY(unicode) == -1)
6390 return NULL;
6391 /* Fast path: if it is a one-byte string, construct
6392 bytes object directly. */
6393 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6394 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6395 PyUnicode_GET_LENGTH(unicode));
6396 /* Non-Latin-1 characters present. Defer to above function to
6397 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006399}
6400
6401PyObject*
6402PyUnicode_AsLatin1String(PyObject *unicode)
6403{
6404 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
6407/* --- 7-bit ASCII Codec -------------------------------------------------- */
6408
Alexander Belopolsky40018472011-02-26 01:02:56 +00006409PyObject *
6410PyUnicode_DecodeASCII(const char *s,
6411 Py_ssize_t size,
6412 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006415 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006416 int kind;
6417 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006418 Py_ssize_t startinpos;
6419 Py_ssize_t endinpos;
6420 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 const char *e;
6422 PyObject *errorHandler = NULL;
6423 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006424
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006426 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006427
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006429 if (size == 1 && (unsigned char)s[0] < 128)
6430 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006431
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006432 _PyUnicodeWriter_Init(&writer, 0);
6433 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006437 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006438 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006439 writer.pos = outpos;
6440 if (writer.pos == size)
6441 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006442
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006443 s += writer.pos;
6444 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 register unsigned char c = (unsigned char)*s;
6447 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006448 PyUnicode_WRITE(kind, data, writer.pos, c);
6449 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 ++s;
6451 }
6452 else {
6453 startinpos = s-starts;
6454 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006455 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 errors, &errorHandler,
6457 "ascii", "ordinal not in range(128)",
6458 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006459 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006461 kind = writer.kind;
6462 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 Py_XDECREF(errorHandler);
6466 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006467 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006468
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006470 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471 Py_XDECREF(errorHandler);
6472 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 return NULL;
6474}
6475
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006476/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006477PyObject *
6478PyUnicode_EncodeASCII(const Py_UNICODE *p,
6479 Py_ssize_t size,
6480 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006482 PyObject *result;
6483 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6484 if (unicode == NULL)
6485 return NULL;
6486 result = unicode_encode_ucs1(unicode, errors, 128);
6487 Py_DECREF(unicode);
6488 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489}
6490
Alexander Belopolsky40018472011-02-26 01:02:56 +00006491PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006492_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493{
6494 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 PyErr_BadArgument();
6496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006498 if (PyUnicode_READY(unicode) == -1)
6499 return NULL;
6500 /* Fast path: if it is an ASCII-only string, construct bytes object
6501 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006502 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006503 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6504 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006506}
6507
6508PyObject *
6509PyUnicode_AsASCIIString(PyObject *unicode)
6510{
6511 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512}
6513
Victor Stinner99b95382011-07-04 14:23:54 +02006514#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006515
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006516/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006517
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006518#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006519#define NEED_RETRY
6520#endif
6521
Victor Stinner3a50e702011-10-18 21:21:00 +02006522#ifndef WC_ERR_INVALID_CHARS
6523# define WC_ERR_INVALID_CHARS 0x0080
6524#endif
6525
6526static char*
6527code_page_name(UINT code_page, PyObject **obj)
6528{
6529 *obj = NULL;
6530 if (code_page == CP_ACP)
6531 return "mbcs";
6532 if (code_page == CP_UTF7)
6533 return "CP_UTF7";
6534 if (code_page == CP_UTF8)
6535 return "CP_UTF8";
6536
6537 *obj = PyBytes_FromFormat("cp%u", code_page);
6538 if (*obj == NULL)
6539 return NULL;
6540 return PyBytes_AS_STRING(*obj);
6541}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006542
Alexander Belopolsky40018472011-02-26 01:02:56 +00006543static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006544is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006545{
6546 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006547 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006548
Victor Stinner3a50e702011-10-18 21:21:00 +02006549 if (!IsDBCSLeadByteEx(code_page, *curr))
6550 return 0;
6551
6552 prev = CharPrevExA(code_page, s, curr, 0);
6553 if (prev == curr)
6554 return 1;
6555 /* FIXME: This code is limited to "true" double-byte encodings,
6556 as it assumes an incomplete character consists of a single
6557 byte. */
6558 if (curr - prev == 2)
6559 return 1;
6560 if (!IsDBCSLeadByteEx(code_page, *prev))
6561 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006562 return 0;
6563}
6564
Victor Stinner3a50e702011-10-18 21:21:00 +02006565static DWORD
6566decode_code_page_flags(UINT code_page)
6567{
6568 if (code_page == CP_UTF7) {
6569 /* The CP_UTF7 decoder only supports flags=0 */
6570 return 0;
6571 }
6572 else
6573 return MB_ERR_INVALID_CHARS;
6574}
6575
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006576/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006577 * Decode a byte string from a Windows code page into unicode object in strict
6578 * mode.
6579 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006580 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6581 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006582 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006583static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006584decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006585 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006586 const char *in,
6587 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006588{
Victor Stinner3a50e702011-10-18 21:21:00 +02006589 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006590 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006591 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006592
6593 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006594 assert(insize > 0);
6595 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6596 if (outsize <= 0)
6597 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006598
6599 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006601 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006602 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 if (*v == NULL)
6604 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006605 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006606 }
6607 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006609 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006610 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006612 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006613 }
6614
6615 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006616 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6617 if (outsize <= 0)
6618 goto error;
6619 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006620
Victor Stinner3a50e702011-10-18 21:21:00 +02006621error:
6622 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6623 return -2;
6624 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006625 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626}
6627
Victor Stinner3a50e702011-10-18 21:21:00 +02006628/*
6629 * Decode a byte string from a code page into unicode object with an error
6630 * handler.
6631 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006632 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006633 * UnicodeDecodeError exception and returns -1 on error.
6634 */
6635static int
6636decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006637 PyObject **v,
6638 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006639 const char *errors)
6640{
6641 const char *startin = in;
6642 const char *endin = in + size;
6643 const DWORD flags = decode_code_page_flags(code_page);
6644 /* Ideally, we should get reason from FormatMessage. This is the Windows
6645 2000 English version of the message. */
6646 const char *reason = "No mapping for the Unicode character exists "
6647 "in the target code page.";
6648 /* each step cannot decode more than 1 character, but a character can be
6649 represented as a surrogate pair */
6650 wchar_t buffer[2], *startout, *out;
6651 int insize, outsize;
6652 PyObject *errorHandler = NULL;
6653 PyObject *exc = NULL;
6654 PyObject *encoding_obj = NULL;
6655 char *encoding;
6656 DWORD err;
6657 int ret = -1;
6658
6659 assert(size > 0);
6660
6661 encoding = code_page_name(code_page, &encoding_obj);
6662 if (encoding == NULL)
6663 return -1;
6664
6665 if (errors == NULL || strcmp(errors, "strict") == 0) {
6666 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6667 UnicodeDecodeError. */
6668 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6669 if (exc != NULL) {
6670 PyCodec_StrictErrors(exc);
6671 Py_CLEAR(exc);
6672 }
6673 goto error;
6674 }
6675
6676 if (*v == NULL) {
6677 /* Create unicode object */
6678 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6679 PyErr_NoMemory();
6680 goto error;
6681 }
Victor Stinnerab595942011-12-17 04:59:06 +01006682 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006683 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006684 if (*v == NULL)
6685 goto error;
6686 startout = PyUnicode_AS_UNICODE(*v);
6687 }
6688 else {
6689 /* Extend unicode object */
6690 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6691 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6692 PyErr_NoMemory();
6693 goto error;
6694 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006695 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006696 goto error;
6697 startout = PyUnicode_AS_UNICODE(*v) + n;
6698 }
6699
6700 /* Decode the byte string character per character */
6701 out = startout;
6702 while (in < endin)
6703 {
6704 /* Decode a character */
6705 insize = 1;
6706 do
6707 {
6708 outsize = MultiByteToWideChar(code_page, flags,
6709 in, insize,
6710 buffer, Py_ARRAY_LENGTH(buffer));
6711 if (outsize > 0)
6712 break;
6713 err = GetLastError();
6714 if (err != ERROR_NO_UNICODE_TRANSLATION
6715 && err != ERROR_INSUFFICIENT_BUFFER)
6716 {
6717 PyErr_SetFromWindowsErr(0);
6718 goto error;
6719 }
6720 insize++;
6721 }
6722 /* 4=maximum length of a UTF-8 sequence */
6723 while (insize <= 4 && (in + insize) <= endin);
6724
6725 if (outsize <= 0) {
6726 Py_ssize_t startinpos, endinpos, outpos;
6727
6728 startinpos = in - startin;
6729 endinpos = startinpos + 1;
6730 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006731 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006732 errors, &errorHandler,
6733 encoding, reason,
6734 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006735 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006736 {
6737 goto error;
6738 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006739 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006740 }
6741 else {
6742 in += insize;
6743 memcpy(out, buffer, outsize * sizeof(wchar_t));
6744 out += outsize;
6745 }
6746 }
6747
6748 /* write a NUL character at the end */
6749 *out = 0;
6750
6751 /* Extend unicode object */
6752 outsize = out - startout;
6753 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006754 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006755 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006756 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006757
6758error:
6759 Py_XDECREF(encoding_obj);
6760 Py_XDECREF(errorHandler);
6761 Py_XDECREF(exc);
6762 return ret;
6763}
6764
Victor Stinner3a50e702011-10-18 21:21:00 +02006765static PyObject *
6766decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006767 const char *s, Py_ssize_t size,
6768 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006769{
Victor Stinner76a31a62011-11-04 00:05:13 +01006770 PyObject *v = NULL;
6771 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006772
Victor Stinner3a50e702011-10-18 21:21:00 +02006773 if (code_page < 0) {
6774 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6775 return NULL;
6776 }
6777
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006778 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006780
Victor Stinner76a31a62011-11-04 00:05:13 +01006781 do
6782 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006783#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006784 if (size > INT_MAX) {
6785 chunk_size = INT_MAX;
6786 final = 0;
6787 done = 0;
6788 }
6789 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006791 {
6792 chunk_size = (int)size;
6793 final = (consumed == NULL);
6794 done = 1;
6795 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006796
Victor Stinner76a31a62011-11-04 00:05:13 +01006797 /* Skip trailing lead-byte unless 'final' is set */
6798 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6799 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006800
Victor Stinner76a31a62011-11-04 00:05:13 +01006801 if (chunk_size == 0 && done) {
6802 if (v != NULL)
6803 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006804 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006805 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806
Victor Stinner76a31a62011-11-04 00:05:13 +01006807
6808 converted = decode_code_page_strict(code_page, &v,
6809 s, chunk_size);
6810 if (converted == -2)
6811 converted = decode_code_page_errors(code_page, &v,
6812 s, chunk_size,
6813 errors);
6814 assert(converted != 0);
6815
6816 if (converted < 0) {
6817 Py_XDECREF(v);
6818 return NULL;
6819 }
6820
6821 if (consumed)
6822 *consumed += converted;
6823
6824 s += converted;
6825 size -= converted;
6826 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006827
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006828 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829}
6830
Alexander Belopolsky40018472011-02-26 01:02:56 +00006831PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006832PyUnicode_DecodeCodePageStateful(int code_page,
6833 const char *s,
6834 Py_ssize_t size,
6835 const char *errors,
6836 Py_ssize_t *consumed)
6837{
6838 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6839}
6840
6841PyObject *
6842PyUnicode_DecodeMBCSStateful(const char *s,
6843 Py_ssize_t size,
6844 const char *errors,
6845 Py_ssize_t *consumed)
6846{
6847 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6848}
6849
6850PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006851PyUnicode_DecodeMBCS(const char *s,
6852 Py_ssize_t size,
6853 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006854{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006855 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6856}
6857
Victor Stinner3a50e702011-10-18 21:21:00 +02006858static DWORD
6859encode_code_page_flags(UINT code_page, const char *errors)
6860{
6861 if (code_page == CP_UTF8) {
6862 if (winver.dwMajorVersion >= 6)
6863 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6864 and later */
6865 return WC_ERR_INVALID_CHARS;
6866 else
6867 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6868 return 0;
6869 }
6870 else if (code_page == CP_UTF7) {
6871 /* CP_UTF7 only supports flags=0 */
6872 return 0;
6873 }
6874 else {
6875 if (errors != NULL && strcmp(errors, "replace") == 0)
6876 return 0;
6877 else
6878 return WC_NO_BEST_FIT_CHARS;
6879 }
6880}
6881
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006882/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 * Encode a Unicode string to a Windows code page into a byte string in strict
6884 * mode.
6885 *
6886 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006887 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006889static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006890encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006891 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893{
Victor Stinner554f3f02010-06-16 23:33:54 +00006894 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006895 BOOL *pusedDefaultChar = &usedDefaultChar;
6896 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006897 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006898 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006899 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006900 const DWORD flags = encode_code_page_flags(code_page, NULL);
6901 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006902 /* Create a substring so that we can get the UTF-16 representation
6903 of just the slice under consideration. */
6904 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905
Martin v. Löwis3d325192011-11-04 18:23:06 +01006906 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006907
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006909 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006910 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006911 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006912
Victor Stinner2fc507f2011-11-04 20:06:39 +01006913 substring = PyUnicode_Substring(unicode, offset, offset+len);
6914 if (substring == NULL)
6915 return -1;
6916 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6917 if (p == NULL) {
6918 Py_DECREF(substring);
6919 return -1;
6920 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006921
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006922 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 outsize = WideCharToMultiByte(code_page, flags,
6924 p, size,
6925 NULL, 0,
6926 NULL, pusedDefaultChar);
6927 if (outsize <= 0)
6928 goto error;
6929 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006930 if (pusedDefaultChar && *pusedDefaultChar) {
6931 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006932 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006933 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006934
Victor Stinner3a50e702011-10-18 21:21:00 +02006935 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006937 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006938 if (*outbytes == NULL) {
6939 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006941 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006942 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006943 }
6944 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006946 const Py_ssize_t n = PyBytes_Size(*outbytes);
6947 if (outsize > PY_SSIZE_T_MAX - n) {
6948 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006949 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006951 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006952 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6953 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006954 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006955 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006956 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006957 }
6958
6959 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006960 outsize = WideCharToMultiByte(code_page, flags,
6961 p, size,
6962 out, outsize,
6963 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006964 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 if (outsize <= 0)
6966 goto error;
6967 if (pusedDefaultChar && *pusedDefaultChar)
6968 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006970
Victor Stinner3a50e702011-10-18 21:21:00 +02006971error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006972 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6974 return -2;
6975 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006976 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006977}
6978
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * Returns consumed characters on success, or raises an OSError and returns
 * -1 on other error.
 */
6986static int
6987encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01006988 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006989 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006990{
Victor Stinner3a50e702011-10-18 21:21:00 +02006991 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006992 Py_ssize_t pos = unicode_offset;
6993 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006994 /* Ideally, we should get reason from FormatMessage. This is the Windows
6995 2000 English version of the message. */
6996 const char *reason = "invalid character";
6997 /* 4=maximum length of a UTF-8 sequence */
6998 char buffer[4];
6999 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7000 Py_ssize_t outsize;
7001 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007002 PyObject *errorHandler = NULL;
7003 PyObject *exc = NULL;
7004 PyObject *encoding_obj = NULL;
7005 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007006 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007007 PyObject *rep;
7008 int ret = -1;
7009
7010 assert(insize > 0);
7011
7012 encoding = code_page_name(code_page, &encoding_obj);
7013 if (encoding == NULL)
7014 return -1;
7015
7016 if (errors == NULL || strcmp(errors, "strict") == 0) {
7017 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7018 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007019 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007020 if (exc != NULL) {
7021 PyCodec_StrictErrors(exc);
7022 Py_DECREF(exc);
7023 }
7024 Py_XDECREF(encoding_obj);
7025 return -1;
7026 }
7027
7028 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7029 pusedDefaultChar = &usedDefaultChar;
7030 else
7031 pusedDefaultChar = NULL;
7032
7033 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7034 PyErr_NoMemory();
7035 goto error;
7036 }
7037 outsize = insize * Py_ARRAY_LENGTH(buffer);
7038
7039 if (*outbytes == NULL) {
7040 /* Create string object */
7041 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7042 if (*outbytes == NULL)
7043 goto error;
7044 out = PyBytes_AS_STRING(*outbytes);
7045 }
7046 else {
7047 /* Extend string object */
7048 Py_ssize_t n = PyBytes_Size(*outbytes);
7049 if (n > PY_SSIZE_T_MAX - outsize) {
7050 PyErr_NoMemory();
7051 goto error;
7052 }
7053 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7054 goto error;
7055 out = PyBytes_AS_STRING(*outbytes) + n;
7056 }
7057
7058 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007059 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007060 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007061 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7062 wchar_t chars[2];
7063 int charsize;
7064 if (ch < 0x10000) {
7065 chars[0] = (wchar_t)ch;
7066 charsize = 1;
7067 }
7068 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007069 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7070 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007071 charsize = 2;
7072 }
7073
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007075 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007076 buffer, Py_ARRAY_LENGTH(buffer),
7077 NULL, pusedDefaultChar);
7078 if (outsize > 0) {
7079 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7080 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007081 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 memcpy(out, buffer, outsize);
7083 out += outsize;
7084 continue;
7085 }
7086 }
7087 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7088 PyErr_SetFromWindowsErr(0);
7089 goto error;
7090 }
7091
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 rep = unicode_encode_call_errorhandler(
7093 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007094 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007095 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007096 if (rep == NULL)
7097 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007098 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007099
7100 if (PyBytes_Check(rep)) {
7101 outsize = PyBytes_GET_SIZE(rep);
7102 if (outsize != 1) {
7103 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7104 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7105 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7106 Py_DECREF(rep);
7107 goto error;
7108 }
7109 out = PyBytes_AS_STRING(*outbytes) + offset;
7110 }
7111 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7112 out += outsize;
7113 }
7114 else {
7115 Py_ssize_t i;
7116 enum PyUnicode_Kind kind;
7117 void *data;
7118
Benjamin Petersonbac79492012-01-14 13:34:47 -05007119 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 Py_DECREF(rep);
7121 goto error;
7122 }
7123
7124 outsize = PyUnicode_GET_LENGTH(rep);
7125 if (outsize != 1) {
7126 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7127 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7128 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7129 Py_DECREF(rep);
7130 goto error;
7131 }
7132 out = PyBytes_AS_STRING(*outbytes) + offset;
7133 }
7134 kind = PyUnicode_KIND(rep);
7135 data = PyUnicode_DATA(rep);
7136 for (i=0; i < outsize; i++) {
7137 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7138 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007139 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007140 encoding, unicode,
7141 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 "unable to encode error handler result to ASCII");
7143 Py_DECREF(rep);
7144 goto error;
7145 }
7146 *out = (unsigned char)ch;
7147 out++;
7148 }
7149 }
7150 Py_DECREF(rep);
7151 }
7152 /* write a NUL byte */
7153 *out = 0;
7154 outsize = out - PyBytes_AS_STRING(*outbytes);
7155 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7156 if (_PyBytes_Resize(outbytes, outsize) < 0)
7157 goto error;
7158 ret = 0;
7159
7160error:
7161 Py_XDECREF(encoding_obj);
7162 Py_XDECREF(errorHandler);
7163 Py_XDECREF(exc);
7164 return ret;
7165}
7166
Victor Stinner3a50e702011-10-18 21:21:00 +02007167static PyObject *
7168encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007169 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 const char *errors)
7171{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007172 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007174 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007175 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007176
Benjamin Petersonbac79492012-01-14 13:34:47 -05007177 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007178 return NULL;
7179 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007180
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 if (code_page < 0) {
7182 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7183 return NULL;
7184 }
7185
Martin v. Löwis3d325192011-11-04 18:23:06 +01007186 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007187 return PyBytes_FromStringAndSize(NULL, 0);
7188
Victor Stinner7581cef2011-11-03 22:32:33 +01007189 offset = 0;
7190 do
7191 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007192#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007193 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007194 chunks. */
7195 if (len > INT_MAX/2) {
7196 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007197 done = 0;
7198 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007199 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007200#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007201 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007202 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007203 done = 1;
7204 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205
Victor Stinner76a31a62011-11-04 00:05:13 +01007206 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007207 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007208 errors);
7209 if (ret == -2)
7210 ret = encode_code_page_errors(code_page, &outbytes,
7211 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007212 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007213 if (ret < 0) {
7214 Py_XDECREF(outbytes);
7215 return NULL;
7216 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217
Victor Stinner7581cef2011-11-03 22:32:33 +01007218 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007219 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007220 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007221
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 return outbytes;
7223}
7224
7225PyObject *
7226PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7227 Py_ssize_t size,
7228 const char *errors)
7229{
Victor Stinner7581cef2011-11-03 22:32:33 +01007230 PyObject *unicode, *res;
7231 unicode = PyUnicode_FromUnicode(p, size);
7232 if (unicode == NULL)
7233 return NULL;
7234 res = encode_code_page(CP_ACP, unicode, errors);
7235 Py_DECREF(unicode);
7236 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007237}
7238
7239PyObject *
7240PyUnicode_EncodeCodePage(int code_page,
7241 PyObject *unicode,
7242 const char *errors)
7243{
Victor Stinner7581cef2011-11-03 22:32:33 +01007244 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007245}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007246
Alexander Belopolsky40018472011-02-26 01:02:56 +00007247PyObject *
7248PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007249{
7250 if (!PyUnicode_Check(unicode)) {
7251 PyErr_BadArgument();
7252 return NULL;
7253 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007254 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007255}
7256
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007257#undef NEED_RETRY
7258
Victor Stinner99b95382011-07-04 14:23:54 +02007259#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007260
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261/* --- Character Mapping Codec -------------------------------------------- */
7262
Alexander Belopolsky40018472011-02-26 01:02:56 +00007263PyObject *
7264PyUnicode_DecodeCharmap(const char *s,
7265 Py_ssize_t size,
7266 PyObject *mapping,
7267 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007269 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007270 Py_ssize_t startinpos;
7271 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007272 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007273 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007274 PyObject *errorHandler = NULL;
7275 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007276
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 /* Default to Latin-1 */
7278 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007282 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007283 _PyUnicodeWriter_Init(&writer, 0);
7284 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007287 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007288 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007289 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007290 enum PyUnicode_Kind mapkind;
7291 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007292 Py_UCS4 x;
Victor Stinner03c3e352013-04-09 21:53:09 +02007293 unsigned char ch;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007294
Benjamin Petersonbac79492012-01-14 13:34:47 -05007295 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007296 return NULL;
7297
7298 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007299 mapdata = PyUnicode_DATA(mapping);
7300 mapkind = PyUnicode_KIND(mapping);
Victor Stinner03c3e352013-04-09 21:53:09 +02007301
7302 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7303 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7304 * is disabled in encoding aliases, latin1 is preferred because
7305 * its implementation is faster. */
7306 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7307 Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
7308 Py_UCS4 maxchar = writer.maxchar;
7309
7310 assert (writer.kind == PyUnicode_1BYTE_KIND);
7311 while (s < e) {
7312 ch = *s;
7313 x = mapdata_ucs1[ch];
7314 if (x > maxchar) {
7315 if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
7316 goto onError;
7317 maxchar = writer.maxchar;
7318 outdata = (Py_UCS1 *)writer.data;
7319 }
7320 outdata[writer.pos] = x;
7321 writer.pos++;
7322 ++s;
7323 }
7324 }
7325
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007327 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007328 enum PyUnicode_Kind outkind = writer.kind;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007329 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007330 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007331 Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007332 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007333 while (s < e) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007334 ch = *s;
7335 x = mapdata_ucs2[ch];
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007336 if (x > maxchar)
7337 goto Error;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007338 outdata[writer.pos] = x;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007339 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007340 ++s;
7341 }
7342 break;
7343 }
7344 else if (outkind == PyUnicode_2BYTE_KIND) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007345 Py_UCS2 *outdata = (Py_UCS2 *)writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007346 while (s < e) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007347 ch = *s;
7348 x = mapdata_ucs2[ch];
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007349 if (x == 0xFFFE)
7350 goto Error;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007351 outdata[writer.pos] = x;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007352 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007353 ++s;
7354 }
7355 break;
7356 }
7357 }
7358 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007361 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007362 else
7363 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007364Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007365 if (x == 0xfffe)
7366 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 startinpos = s-starts;
7369 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007370 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 errors, &errorHandler,
7372 "charmap", "character maps to <undefined>",
7373 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007374 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 goto onError;
7376 }
7377 continue;
7378 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007379
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02007380 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007381 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007383 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007384 }
7385 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 while (s < e) {
7387 unsigned char ch = *s;
7388 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007389
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7391 w = PyLong_FromLong((long)ch);
7392 if (w == NULL)
7393 goto onError;
7394 x = PyObject_GetItem(mapping, w);
7395 Py_DECREF(w);
7396 if (x == NULL) {
7397 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7398 /* No mapping found means: mapping is undefined. */
7399 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007400 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 } else
7402 goto onError;
7403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007404
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007406 if (x == Py_None)
7407 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 if (PyLong_Check(x)) {
7409 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007410 if (value == 0xFFFE)
7411 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007412 if (value < 0 || value > MAX_UNICODE) {
7413 PyErr_Format(PyExc_TypeError,
7414 "character mapping must be in range(0x%lx)",
7415 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 Py_DECREF(x);
7417 goto onError;
7418 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007419
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02007420 if (_PyUnicodeWriter_WriteCharInline(&writer, value) < 0) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007421 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007422 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007423 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 else if (PyUnicode_Check(x)) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007426 if (PyUnicode_READY(x) == -1) {
7427 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007428 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007429 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007430 if (PyUnicode_GET_LENGTH(x) == 1) {
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007431 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007432 if (value == 0xFFFE)
7433 goto Undefined;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02007434 if (_PyUnicodeWriter_WriteCharInline(&writer, value) < 0) {
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007435 Py_DECREF(x);
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007436 goto onError;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007437 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007438 }
7439 else {
7440 writer.overallocate = 1;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007441 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007442 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007443 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007444 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007445 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 }
7447 else {
7448 /* wrong return value */
7449 PyErr_SetString(PyExc_TypeError,
7450 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007451 Py_DECREF(x);
7452 goto onError;
7453 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 Py_DECREF(x);
7455 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007456 continue;
7457Undefined:
7458 /* undefined mapping */
7459 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007460 startinpos = s-starts;
7461 endinpos = startinpos+1;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007462 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007463 errors, &errorHandler,
7464 "charmap", "character maps to <undefined>",
7465 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007466 &writer)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007467 goto onError;
7468 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007471 Py_XDECREF(errorHandler);
7472 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007473 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007474
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007476 Py_XDECREF(errorHandler);
7477 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007478 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 return NULL;
7480}
7481
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007482/* Charmap encoding: the lookup table */
7483
Alexander Belopolsky40018472011-02-26 01:02:56 +00007484struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 PyObject_HEAD
7486 unsigned char level1[32];
7487 int count2, count3;
7488 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007489};
7490
7491static PyObject*
7492encoding_map_size(PyObject *obj, PyObject* args)
7493{
7494 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007495 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007497}
7498
7499static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007500 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 PyDoc_STR("Return the size (in bytes) of this object") },
7502 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007503};
7504
7505static void
7506encoding_map_dealloc(PyObject* o)
7507{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007508 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007509}
7510
7511static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007512 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 "EncodingMap", /*tp_name*/
7514 sizeof(struct encoding_map), /*tp_basicsize*/
7515 0, /*tp_itemsize*/
7516 /* methods */
7517 encoding_map_dealloc, /*tp_dealloc*/
7518 0, /*tp_print*/
7519 0, /*tp_getattr*/
7520 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007521 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 0, /*tp_repr*/
7523 0, /*tp_as_number*/
7524 0, /*tp_as_sequence*/
7525 0, /*tp_as_mapping*/
7526 0, /*tp_hash*/
7527 0, /*tp_call*/
7528 0, /*tp_str*/
7529 0, /*tp_getattro*/
7530 0, /*tp_setattro*/
7531 0, /*tp_as_buffer*/
7532 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7533 0, /*tp_doc*/
7534 0, /*tp_traverse*/
7535 0, /*tp_clear*/
7536 0, /*tp_richcompare*/
7537 0, /*tp_weaklistoffset*/
7538 0, /*tp_iter*/
7539 0, /*tp_iternext*/
7540 encoding_map_methods, /*tp_methods*/
7541 0, /*tp_members*/
7542 0, /*tp_getset*/
7543 0, /*tp_base*/
7544 0, /*tp_dict*/
7545 0, /*tp_descr_get*/
7546 0, /*tp_descr_set*/
7547 0, /*tp_dictoffset*/
7548 0, /*tp_init*/
7549 0, /*tp_alloc*/
7550 0, /*tp_new*/
7551 0, /*tp_free*/
7552 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007553};
7554
7555PyObject*
7556PyUnicode_BuildEncodingMap(PyObject* string)
7557{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007558 PyObject *result;
7559 struct encoding_map *mresult;
7560 int i;
7561 int need_dict = 0;
7562 unsigned char level1[32];
7563 unsigned char level2[512];
7564 unsigned char *mlevel1, *mlevel2, *mlevel3;
7565 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007566 int kind;
7567 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007568 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007569 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007570
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007571 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007572 PyErr_BadArgument();
7573 return NULL;
7574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007575 kind = PyUnicode_KIND(string);
7576 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007577 length = PyUnicode_GET_LENGTH(string);
7578 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007579 memset(level1, 0xFF, sizeof level1);
7580 memset(level2, 0xFF, sizeof level2);
7581
7582 /* If there isn't a one-to-one mapping of NULL to \0,
7583 or if there are non-BMP characters, we need to use
7584 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007585 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007586 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007587 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007588 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007589 ch = PyUnicode_READ(kind, data, i);
7590 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007591 need_dict = 1;
7592 break;
7593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007594 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007595 /* unmapped character */
7596 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007597 l1 = ch >> 11;
7598 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007599 if (level1[l1] == 0xFF)
7600 level1[l1] = count2++;
7601 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007602 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007603 }
7604
7605 if (count2 >= 0xFF || count3 >= 0xFF)
7606 need_dict = 1;
7607
7608 if (need_dict) {
7609 PyObject *result = PyDict_New();
7610 PyObject *key, *value;
7611 if (!result)
7612 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007613 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007614 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007615 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007616 if (!key || !value)
7617 goto failed1;
7618 if (PyDict_SetItem(result, key, value) == -1)
7619 goto failed1;
7620 Py_DECREF(key);
7621 Py_DECREF(value);
7622 }
7623 return result;
7624 failed1:
7625 Py_XDECREF(key);
7626 Py_XDECREF(value);
7627 Py_DECREF(result);
7628 return NULL;
7629 }
7630
7631 /* Create a three-level trie */
7632 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7633 16*count2 + 128*count3 - 1);
7634 if (!result)
7635 return PyErr_NoMemory();
7636 PyObject_Init(result, &EncodingMapType);
7637 mresult = (struct encoding_map*)result;
7638 mresult->count2 = count2;
7639 mresult->count3 = count3;
7640 mlevel1 = mresult->level1;
7641 mlevel2 = mresult->level23;
7642 mlevel3 = mresult->level23 + 16*count2;
7643 memcpy(mlevel1, level1, 32);
7644 memset(mlevel2, 0xFF, 16*count2);
7645 memset(mlevel3, 0, 128*count3);
7646 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007647 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007648 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007649 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7650 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007651 /* unmapped character */
7652 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007653 o1 = ch>>11;
7654 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007655 i2 = 16*mlevel1[o1] + o2;
7656 if (mlevel2[i2] == 0xFF)
7657 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007658 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007659 i3 = 128*mlevel2[i2] + o3;
7660 mlevel3[i3] = i;
7661 }
7662 return result;
7663}
7664
7665static int
Victor Stinner22168992011-11-20 17:09:18 +01007666encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007667{
7668 struct encoding_map *map = (struct encoding_map*)mapping;
7669 int l1 = c>>11;
7670 int l2 = (c>>7) & 0xF;
7671 int l3 = c & 0x7F;
7672 int i;
7673
Victor Stinner22168992011-11-20 17:09:18 +01007674 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007676 if (c == 0)
7677 return 0;
7678 /* level 1*/
7679 i = map->level1[l1];
7680 if (i == 0xFF) {
7681 return -1;
7682 }
7683 /* level 2*/
7684 i = map->level23[16*i+l2];
7685 if (i == 0xFF) {
7686 return -1;
7687 }
7688 /* level 3 */
7689 i = map->level23[16*map->count2 + 128*i + l3];
7690 if (i == 0) {
7691 return -1;
7692 }
7693 return i;
7694}
7695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007696/* Lookup the character ch in the mapping. If the character
7697 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007698 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007699static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007700charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701{
Christian Heimes217cfd12007-12-02 14:31:20 +00007702 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703 PyObject *x;
7704
7705 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 x = PyObject_GetItem(mapping, w);
7708 Py_DECREF(w);
7709 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7711 /* No mapping found means: mapping is undefined. */
7712 PyErr_Clear();
7713 x = Py_None;
7714 Py_INCREF(x);
7715 return x;
7716 } else
7717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007719 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007721 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 long value = PyLong_AS_LONG(x);
7723 if (value < 0 || value > 255) {
7724 PyErr_SetString(PyExc_TypeError,
7725 "character mapping must be in range(256)");
7726 Py_DECREF(x);
7727 return NULL;
7728 }
7729 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007731 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 /* wrong return value */
7735 PyErr_Format(PyExc_TypeError,
7736 "character mapping must return integer, bytes or None, not %.400s",
7737 x->ob_type->tp_name);
7738 Py_DECREF(x);
7739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 }
7741}
7742
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007744charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007745{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007746 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7747 /* exponentially overallocate to minimize reallocations */
7748 if (requiredsize < 2*outsize)
7749 requiredsize = 2*outsize;
7750 if (_PyBytes_Resize(outobj, requiredsize))
7751 return -1;
7752 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007753}
7754
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded form was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* lookup or output resizing failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007758/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007759 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007760 space is available. Return a new reference to the object that
7761 was put in the output buffer, or Py_None, if the mapping was undefined
7762 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007763 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007764static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007765charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007766 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007767{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007768 PyObject *rep;
7769 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007770 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007771
Christian Heimes90aa7642007-12-19 02:45:37 +00007772 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775 if (res == -1)
7776 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 if (outsize<requiredsize)
7778 if (charmapencode_resize(outobj, outpos, requiredsize))
7779 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007780 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 outstart[(*outpos)++] = (char)res;
7782 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783 }
7784
7785 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007788 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 Py_DECREF(rep);
7790 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007791 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 if (PyLong_Check(rep)) {
7793 Py_ssize_t requiredsize = *outpos+1;
7794 if (outsize<requiredsize)
7795 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7796 Py_DECREF(rep);
7797 return enc_EXCEPTION;
7798 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007799 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007801 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 else {
7803 const char *repchars = PyBytes_AS_STRING(rep);
7804 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7805 Py_ssize_t requiredsize = *outpos+repsize;
7806 if (outsize<requiredsize)
7807 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7808 Py_DECREF(rep);
7809 return enc_EXCEPTION;
7810 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007811 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 memcpy(outstart + *outpos, repchars, repsize);
7813 *outpos += repsize;
7814 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007816 Py_DECREF(rep);
7817 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007818}
7819
7820/* handle an error in PyUnicode_EncodeCharmap
7821 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007822static int
7823charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007824 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007825 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007826 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007827 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007828{
7829 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007830 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007831 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007832 enum PyUnicode_Kind kind;
7833 void *data;
7834 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007835 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007836 Py_ssize_t collstartpos = *inpos;
7837 Py_ssize_t collendpos = *inpos+1;
7838 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007839 char *encoding = "charmap";
7840 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007842 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007843 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007844
Benjamin Petersonbac79492012-01-14 13:34:47 -05007845 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007846 return -1;
7847 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007848 /* find all unencodable characters */
7849 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007851 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007852 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007853 val = encoding_map_lookup(ch, mapping);
7854 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 break;
7856 ++collendpos;
7857 continue;
7858 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007860 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7861 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 if (rep==NULL)
7863 return -1;
7864 else if (rep!=Py_None) {
7865 Py_DECREF(rep);
7866 break;
7867 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007868 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870 }
7871 /* cache callback name lookup
7872 * (if not done yet, i.e. it's the first error) */
7873 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 if ((errors==NULL) || (!strcmp(errors, "strict")))
7875 *known_errorHandler = 1;
7876 else if (!strcmp(errors, "replace"))
7877 *known_errorHandler = 2;
7878 else if (!strcmp(errors, "ignore"))
7879 *known_errorHandler = 3;
7880 else if (!strcmp(errors, "xmlcharrefreplace"))
7881 *known_errorHandler = 4;
7882 else
7883 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007884 }
7885 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007886 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007887 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 return -1;
7889 case 2: /* replace */
7890 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 x = charmapencode_output('?', mapping, res, respos);
7892 if (x==enc_EXCEPTION) {
7893 return -1;
7894 }
7895 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007896 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 return -1;
7898 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007899 }
7900 /* fall through */
7901 case 3: /* ignore */
7902 *inpos = collendpos;
7903 break;
7904 case 4: /* xmlcharrefreplace */
7905 /* generate replacement (temporarily (mis)uses p) */
7906 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 char buffer[2+29+1+1];
7908 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007909 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 for (cp = buffer; *cp; ++cp) {
7911 x = charmapencode_output(*cp, mapping, res, respos);
7912 if (x==enc_EXCEPTION)
7913 return -1;
7914 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007915 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 return -1;
7917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 }
7919 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920 *inpos = collendpos;
7921 break;
7922 default:
7923 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007924 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007926 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007928 if (PyBytes_Check(repunicode)) {
7929 /* Directly copy bytes result to output. */
7930 Py_ssize_t outsize = PyBytes_Size(*res);
7931 Py_ssize_t requiredsize;
7932 repsize = PyBytes_Size(repunicode);
7933 requiredsize = *respos + repsize;
7934 if (requiredsize > outsize)
7935 /* Make room for all additional bytes. */
7936 if (charmapencode_resize(res, respos, requiredsize)) {
7937 Py_DECREF(repunicode);
7938 return -1;
7939 }
7940 memcpy(PyBytes_AsString(*res) + *respos,
7941 PyBytes_AsString(repunicode), repsize);
7942 *respos += repsize;
7943 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007944 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007945 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007946 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007947 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007948 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007949 Py_DECREF(repunicode);
7950 return -1;
7951 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007952 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007953 data = PyUnicode_DATA(repunicode);
7954 kind = PyUnicode_KIND(repunicode);
7955 for (index = 0; index < repsize; index++) {
7956 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7957 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007959 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 return -1;
7961 }
7962 else if (x==enc_FAILED) {
7963 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007964 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 return -1;
7966 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007967 }
7968 *inpos = newpos;
7969 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970 }
7971 return 0;
7972}
7973
Alexander Belopolsky40018472011-02-26 01:02:56 +00007974PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007975_PyUnicode_EncodeCharmap(PyObject *unicode,
7976 PyObject *mapping,
7977 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979 /* output object */
7980 PyObject *res = NULL;
7981 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007982 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007983 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007985 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986 PyObject *errorHandler = NULL;
7987 PyObject *exc = NULL;
7988 /* the following variable is used for caching string comparisons
7989 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7990 * 3=ignore, 4=xmlcharrefreplace */
7991 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02007992 void *data;
7993 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994
Benjamin Petersonbac79492012-01-14 13:34:47 -05007995 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007996 return NULL;
7997 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02007998 data = PyUnicode_DATA(unicode);
7999 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008000
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 /* Default to Latin-1 */
8002 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008003 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005 /* allocate enough for a simple encoding without
8006 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008007 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008 if (res == NULL)
8009 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008010 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008013 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008014 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008016 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 if (x==enc_EXCEPTION) /* error */
8018 goto onError;
8019 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008020 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 &exc,
8022 &known_errorHandler, &errorHandler, errors,
8023 &res, &respos)) {
8024 goto onError;
8025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008026 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 else
8028 /* done with this character => adjust input position */
8029 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008032 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008033 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008034 if (_PyBytes_Resize(&res, respos) < 0)
8035 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008036
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008037 Py_XDECREF(exc);
8038 Py_XDECREF(errorHandler);
8039 return res;
8040
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008042 Py_XDECREF(res);
8043 Py_XDECREF(exc);
8044 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 return NULL;
8046}
8047
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008048/* Deprecated */
8049PyObject *
8050PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8051 Py_ssize_t size,
8052 PyObject *mapping,
8053 const char *errors)
8054{
8055 PyObject *result;
8056 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8057 if (unicode == NULL)
8058 return NULL;
8059 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8060 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008061 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008062}
8063
Alexander Belopolsky40018472011-02-26 01:02:56 +00008064PyObject *
8065PyUnicode_AsCharmapString(PyObject *unicode,
8066 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067{
8068 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 PyErr_BadArgument();
8070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008072 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073}
8074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008076static void
8077make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008078 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008079 Py_ssize_t startpos, Py_ssize_t endpos,
8080 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008083 *exceptionObject = _PyUnicodeTranslateError_Create(
8084 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 }
8086 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8088 goto onError;
8089 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8090 goto onError;
8091 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8092 goto onError;
8093 return;
8094 onError:
8095 Py_DECREF(*exceptionObject);
8096 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 }
8098}
8099
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100/* error handling callback helper:
8101 build arguments, call the callback and check the arguments,
8102 put the result into newpos and return the replacement string, which
8103 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008104static PyObject *
8105unicode_translate_call_errorhandler(const char *errors,
8106 PyObject **errorHandler,
8107 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008109 Py_ssize_t startpos, Py_ssize_t endpos,
8110 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008112 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008114 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008115 PyObject *restuple;
8116 PyObject *resunicode;
8117
8118 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122 }
8123
8124 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008125 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008126 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128
8129 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008134 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 Py_DECREF(restuple);
8136 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008137 }
8138 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 &resunicode, &i_newpos)) {
8140 Py_DECREF(restuple);
8141 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008145 else
8146 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008147 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8149 Py_DECREF(restuple);
8150 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152 Py_INCREF(resunicode);
8153 Py_DECREF(restuple);
8154 return resunicode;
8155}
8156
8157/* Lookup the character ch in the mapping and put the result in result,
8158 which must be decrefed by the caller.
8159 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008160static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162{
Christian Heimes217cfd12007-12-02 14:31:20 +00008163 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008164 PyObject *x;
8165
8166 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008168 x = PyObject_GetItem(mapping, w);
8169 Py_DECREF(w);
8170 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8172 /* No mapping found means: use 1:1 mapping. */
8173 PyErr_Clear();
8174 *result = NULL;
8175 return 0;
8176 } else
8177 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008178 }
8179 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 *result = x;
8181 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008182 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008183 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 long value = PyLong_AS_LONG(x);
8185 long max = PyUnicode_GetMax();
8186 if (value < 0 || value > max) {
8187 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008188 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 Py_DECREF(x);
8190 return -1;
8191 }
8192 *result = x;
8193 return 0;
8194 }
8195 else if (PyUnicode_Check(x)) {
8196 *result = x;
8197 return 0;
8198 }
8199 else {
8200 /* wrong return value */
8201 PyErr_SetString(PyExc_TypeError,
8202 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 Py_DECREF(x);
8204 return -1;
8205 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008206}
8207/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 if not reallocate and adjust various state variables.
8209 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008210static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008214 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008215 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008216 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 /* exponentially overallocate to minimize reallocations */
8218 if (requiredsize < 2 * oldsize)
8219 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008220 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8221 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008223 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008224 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225 }
8226 return 0;
8227}
8228/* lookup the character, put the result in the output string and adjust
8229 various state variables. Return a new reference to the object that
8230 was put in the output buffer in *result, or Py_None, if the mapping was
8231 undefined (in which case no character was written).
8232 The called must decref result.
8233 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008234static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8236 PyObject *mapping, Py_UCS4 **output,
8237 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008238 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8241 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008245 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008246 }
8247 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008249 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 }
8253 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 Py_ssize_t repsize;
8255 if (PyUnicode_READY(*res) == -1)
8256 return -1;
8257 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 if (repsize==1) {
8259 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 }
8262 else if (repsize!=0) {
8263 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008264 Py_ssize_t requiredsize = *opos +
8265 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008267 Py_ssize_t i;
8268 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008270 for(i = 0; i < repsize; i++)
8271 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 }
8274 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 return 0;
8277}
8278
Alexander Belopolsky40018472011-02-26 01:02:56 +00008279PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008280_PyUnicode_TranslateCharmap(PyObject *input,
8281 PyObject *mapping,
8282 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284 /* input object */
8285 char *idata;
8286 Py_ssize_t size, i;
8287 int kind;
8288 /* output buffer */
8289 Py_UCS4 *output = NULL;
8290 Py_ssize_t osize;
8291 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 char *reason = "character maps to <undefined>";
8295 PyObject *errorHandler = NULL;
8296 PyObject *exc = NULL;
8297 /* the following variable is used for caching string comparisons
8298 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8299 * 3=ignore, 4=xmlcharrefreplace */
8300 int known_errorHandler = -1;
8301
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 PyErr_BadArgument();
8304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 if (PyUnicode_READY(input) == -1)
8308 return NULL;
8309 idata = (char*)PyUnicode_DATA(input);
8310 kind = PyUnicode_KIND(input);
8311 size = PyUnicode_GET_LENGTH(input);
8312 i = 0;
8313
8314 if (size == 0) {
8315 Py_INCREF(input);
8316 return input;
8317 }
8318
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319 /* allocate enough for a simple 1:1 translation without
8320 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008321 osize = size;
8322 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8323 opos = 0;
8324 if (output == NULL) {
8325 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 /* try to encode it */
8331 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 if (charmaptranslate_output(input, i, mapping,
8333 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 Py_XDECREF(x);
8335 goto onError;
8336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008337 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 else { /* untranslatable character */
8341 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8342 Py_ssize_t repsize;
8343 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 Py_ssize_t collstart = i;
8347 Py_ssize_t collend = i+1;
8348 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 while (collend < size) {
8352 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 goto onError;
8354 Py_XDECREF(x);
8355 if (x!=Py_None)
8356 break;
8357 ++collend;
8358 }
8359 /* cache callback name lookup
8360 * (if not done yet, i.e. it's the first error) */
8361 if (known_errorHandler==-1) {
8362 if ((errors==NULL) || (!strcmp(errors, "strict")))
8363 known_errorHandler = 1;
8364 else if (!strcmp(errors, "replace"))
8365 known_errorHandler = 2;
8366 else if (!strcmp(errors, "ignore"))
8367 known_errorHandler = 3;
8368 else if (!strcmp(errors, "xmlcharrefreplace"))
8369 known_errorHandler = 4;
8370 else
8371 known_errorHandler = 0;
8372 }
8373 switch (known_errorHandler) {
8374 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008375 make_translate_exception(&exc,
8376 input, collstart, collend, reason);
8377 if (exc != NULL)
8378 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008379 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 case 2: /* replace */
8381 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 for (coll = collstart; coll<collend; coll++)
8383 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 /* fall through */
8385 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 break;
8388 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389 /* generate replacement (temporarily (mis)uses i) */
8390 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 char buffer[2+29+1+1];
8392 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8394 if (charmaptranslate_makespace(&output, &osize,
8395 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 goto onError;
8397 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 break;
8402 default:
8403 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 reason, input, &exc,
8405 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008406 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008408 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008409 Py_DECREF(repunicode);
8410 goto onError;
8411 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 repsize = PyUnicode_GET_LENGTH(repunicode);
8414 if (charmaptranslate_makespace(&output, &osize,
8415 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 Py_DECREF(repunicode);
8417 goto onError;
8418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419 for (uni2 = 0; repsize-->0; ++uni2)
8420 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8421 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 }
8425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8427 if (!res)
8428 goto onError;
8429 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 Py_XDECREF(exc);
8431 Py_XDECREF(errorHandler);
8432 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 Py_XDECREF(exc);
8437 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 return NULL;
8439}
8440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441/* Deprecated. Use PyUnicode_Translate instead. */
8442PyObject *
8443PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8444 Py_ssize_t size,
8445 PyObject *mapping,
8446 const char *errors)
8447{
Christian Heimes5f520f42012-09-11 14:03:25 +02008448 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8450 if (!unicode)
8451 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008452 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8453 Py_DECREF(unicode);
8454 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455}
8456
Alexander Belopolsky40018472011-02-26 01:02:56 +00008457PyObject *
8458PyUnicode_Translate(PyObject *str,
8459 PyObject *mapping,
8460 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461{
8462 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008463
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 str = PyUnicode_FromObject(str);
8465 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008466 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 Py_DECREF(str);
8469 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470}
Tim Petersced69f82003-09-16 20:30:58 +00008471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008473fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474{
8475 /* No need to call PyUnicode_READY(self) because this function is only
8476 called as a callback from fixup() which does it already. */
8477 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8478 const int kind = PyUnicode_KIND(self);
8479 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008480 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008481 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 Py_ssize_t i;
8483
8484 for (i = 0; i < len; ++i) {
8485 ch = PyUnicode_READ(kind, data, i);
8486 fixed = 0;
8487 if (ch > 127) {
8488 if (Py_UNICODE_ISSPACE(ch))
8489 fixed = ' ';
8490 else {
8491 const int decimal = Py_UNICODE_TODECIMAL(ch);
8492 if (decimal >= 0)
8493 fixed = '0' + decimal;
8494 }
8495 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008496 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008497 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 PyUnicode_WRITE(kind, data, i, fixed);
8499 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008500 else
8501 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 }
8504
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008505 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506}
8507
8508PyObject *
8509_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8510{
8511 if (!PyUnicode_Check(unicode)) {
8512 PyErr_BadInternalCall();
8513 return NULL;
8514 }
8515 if (PyUnicode_READY(unicode) == -1)
8516 return NULL;
8517 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8518 /* If the string is already ASCII, just return the same string */
8519 Py_INCREF(unicode);
8520 return unicode;
8521 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008522 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523}
8524
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008525PyObject *
8526PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8527 Py_ssize_t length)
8528{
Victor Stinnerf0124502011-11-21 23:12:56 +01008529 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008530 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008531 Py_UCS4 maxchar;
8532 enum PyUnicode_Kind kind;
8533 void *data;
8534
Victor Stinner99d7ad02012-02-22 13:37:39 +01008535 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008536 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008537 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008538 if (ch > 127) {
8539 int decimal = Py_UNICODE_TODECIMAL(ch);
8540 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008541 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008542 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008543 }
8544 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008545
8546 /* Copy to a new string */
8547 decimal = PyUnicode_New(length, maxchar);
8548 if (decimal == NULL)
8549 return decimal;
8550 kind = PyUnicode_KIND(decimal);
8551 data = PyUnicode_DATA(decimal);
8552 /* Iterate over code points */
8553 for (i = 0; i < length; i++) {
8554 Py_UNICODE ch = s[i];
8555 if (ch > 127) {
8556 int decimal = Py_UNICODE_TODECIMAL(ch);
8557 if (decimal >= 0)
8558 ch = '0' + decimal;
8559 }
8560 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008562 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008563}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008564/* --- Decimal Encoder ---------------------------------------------------- */
8565
Alexander Belopolsky40018472011-02-26 01:02:56 +00008566int
8567PyUnicode_EncodeDecimal(Py_UNICODE *s,
8568 Py_ssize_t length,
8569 char *output,
8570 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008571{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008573 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008574 enum PyUnicode_Kind kind;
8575 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008576
8577 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 PyErr_BadArgument();
8579 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008580 }
8581
Victor Stinner42bf7752011-11-21 22:52:58 +01008582 unicode = PyUnicode_FromUnicode(s, length);
8583 if (unicode == NULL)
8584 return -1;
8585
Benjamin Petersonbac79492012-01-14 13:34:47 -05008586 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008587 Py_DECREF(unicode);
8588 return -1;
8589 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008590 kind = PyUnicode_KIND(unicode);
8591 data = PyUnicode_DATA(unicode);
8592
Victor Stinnerb84d7232011-11-22 01:50:07 +01008593 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008594 PyObject *exc;
8595 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008597 Py_ssize_t startpos;
8598
8599 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008600
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008602 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008603 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008605 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 decimal = Py_UNICODE_TODECIMAL(ch);
8607 if (decimal >= 0) {
8608 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008609 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 continue;
8611 }
8612 if (0 < ch && ch < 256) {
8613 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008614 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 continue;
8616 }
Victor Stinner6345be92011-11-25 20:09:01 +01008617
Victor Stinner42bf7752011-11-21 22:52:58 +01008618 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008619 exc = NULL;
8620 raise_encode_exception(&exc, "decimal", unicode,
8621 startpos, startpos+1,
8622 "invalid decimal Unicode string");
8623 Py_XDECREF(exc);
8624 Py_DECREF(unicode);
8625 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008626 }
8627 /* 0-terminate the output string */
8628 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008629 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008630 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008631}
8632
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633/* --- Helpers ------------------------------------------------------------ */
8634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008636any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 Py_ssize_t start,
8638 Py_ssize_t end)
8639{
8640 int kind1, kind2, kind;
8641 void *buf1, *buf2;
8642 Py_ssize_t len1, len2, result;
8643
8644 kind1 = PyUnicode_KIND(s1);
8645 kind2 = PyUnicode_KIND(s2);
8646 kind = kind1 > kind2 ? kind1 : kind2;
8647 buf1 = PyUnicode_DATA(s1);
8648 buf2 = PyUnicode_DATA(s2);
8649 if (kind1 != kind)
8650 buf1 = _PyUnicode_AsKind(s1, kind);
8651 if (!buf1)
8652 return -2;
8653 if (kind2 != kind)
8654 buf2 = _PyUnicode_AsKind(s2, kind);
8655 if (!buf2) {
8656 if (kind1 != kind) PyMem_Free(buf1);
8657 return -2;
8658 }
8659 len1 = PyUnicode_GET_LENGTH(s1);
8660 len2 = PyUnicode_GET_LENGTH(s2);
8661
Victor Stinner794d5672011-10-10 03:21:36 +02008662 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008663 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008664 case PyUnicode_1BYTE_KIND:
8665 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8666 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8667 else
8668 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8669 break;
8670 case PyUnicode_2BYTE_KIND:
8671 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8672 break;
8673 case PyUnicode_4BYTE_KIND:
8674 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8675 break;
8676 default:
8677 assert(0); result = -2;
8678 }
8679 }
8680 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008681 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008682 case PyUnicode_1BYTE_KIND:
8683 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8684 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8685 else
8686 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8687 break;
8688 case PyUnicode_2BYTE_KIND:
8689 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8690 break;
8691 case PyUnicode_4BYTE_KIND:
8692 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8693 break;
8694 default:
8695 assert(0); result = -2;
8696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 }
8698
8699 if (kind1 != kind)
8700 PyMem_Free(buf1);
8701 if (kind2 != kind)
8702 PyMem_Free(buf2);
8703
8704 return result;
8705}
8706
8707Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008708_PyUnicode_InsertThousandsGrouping(
8709 PyObject *unicode, Py_ssize_t index,
8710 Py_ssize_t n_buffer,
8711 void *digits, Py_ssize_t n_digits,
8712 Py_ssize_t min_width,
8713 const char *grouping, PyObject *thousands_sep,
8714 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715{
Victor Stinner41a863c2012-02-24 00:37:51 +01008716 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008717 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008718 Py_ssize_t thousands_sep_len;
8719 Py_ssize_t len;
8720
8721 if (unicode != NULL) {
8722 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008723 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008724 }
8725 else {
8726 kind = PyUnicode_1BYTE_KIND;
8727 data = NULL;
8728 }
8729 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8730 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8731 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8732 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008733 if (thousands_sep_kind < kind) {
8734 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8735 if (!thousands_sep_data)
8736 return -1;
8737 }
8738 else {
8739 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8740 if (!data)
8741 return -1;
8742 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008743 }
8744
Benjamin Petersonead6b532011-12-20 17:23:42 -06008745 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008747 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008748 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008749 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008750 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008751 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008752 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008753 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008754 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008755 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008756 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008757 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008759 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008760 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008761 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008762 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008763 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008765 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008766 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008767 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008768 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008769 break;
8770 default:
8771 assert(0);
8772 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008774 if (unicode != NULL && thousands_sep_kind != kind) {
8775 if (thousands_sep_kind < kind)
8776 PyMem_Free(thousands_sep_data);
8777 else
8778 PyMem_Free(data);
8779 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008780 if (unicode == NULL) {
8781 *maxchar = 127;
8782 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008783 *maxchar = MAX_MAXCHAR(*maxchar,
8784 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008785 }
8786 }
8787 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788}
8789
8790
/* Helper macro to fix up start/end slice values against len.

   end:   a value above len becomes len; a negative value has len added
          and is then raised to 0 if it is still negative.
   start: a negative value has len added and is then raised to 0 if it is
          still negative; a start above len is left untouched.

   The expansion is a pair of bare statements that assign to `start` and
   `end`, and every argument appears several times in it, so pass plain
   lvalues / side-effect-free expressions. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008805
Alexander Belopolsky40018472011-02-26 01:02:56 +00008806Py_ssize_t
8807PyUnicode_Count(PyObject *str,
8808 PyObject *substr,
8809 Py_ssize_t start,
8810 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008812 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008813 PyObject* str_obj;
8814 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 int kind1, kind2, kind;
8816 void *buf1 = NULL, *buf2 = NULL;
8817 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008818
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008819 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008820 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008822 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008823 if (!sub_obj) {
8824 Py_DECREF(str_obj);
8825 return -1;
8826 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008827 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008828 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 Py_DECREF(str_obj);
8830 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 }
Tim Petersced69f82003-09-16 20:30:58 +00008832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 kind1 = PyUnicode_KIND(str_obj);
8834 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008835 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008838 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008839 if (kind2 > kind) {
8840 Py_DECREF(sub_obj);
8841 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008842 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008843 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008844 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 if (!buf2)
8847 goto onError;
8848 len1 = PyUnicode_GET_LENGTH(str_obj);
8849 len2 = PyUnicode_GET_LENGTH(sub_obj);
8850
8851 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008852 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008854 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8855 result = asciilib_count(
8856 ((Py_UCS1*)buf1) + start, end - start,
8857 buf2, len2, PY_SSIZE_T_MAX
8858 );
8859 else
8860 result = ucs1lib_count(
8861 ((Py_UCS1*)buf1) + start, end - start,
8862 buf2, len2, PY_SSIZE_T_MAX
8863 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 break;
8865 case PyUnicode_2BYTE_KIND:
8866 result = ucs2lib_count(
8867 ((Py_UCS2*)buf1) + start, end - start,
8868 buf2, len2, PY_SSIZE_T_MAX
8869 );
8870 break;
8871 case PyUnicode_4BYTE_KIND:
8872 result = ucs4lib_count(
8873 ((Py_UCS4*)buf1) + start, end - start,
8874 buf2, len2, PY_SSIZE_T_MAX
8875 );
8876 break;
8877 default:
8878 assert(0); result = 0;
8879 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008880
8881 Py_DECREF(sub_obj);
8882 Py_DECREF(str_obj);
8883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 if (kind2 != kind)
8885 PyMem_Free(buf2);
8886
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 onError:
8889 Py_DECREF(sub_obj);
8890 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 if (kind2 != kind && buf2)
8892 PyMem_Free(buf2);
8893 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894}
8895
Alexander Belopolsky40018472011-02-26 01:02:56 +00008896Py_ssize_t
8897PyUnicode_Find(PyObject *str,
8898 PyObject *sub,
8899 Py_ssize_t start,
8900 Py_ssize_t end,
8901 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008903 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008904
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008906 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008908 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008909 if (!sub) {
8910 Py_DECREF(str);
8911 return -2;
8912 }
8913 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8914 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 Py_DECREF(str);
8916 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917 }
Tim Petersced69f82003-09-16 20:30:58 +00008918
Victor Stinner794d5672011-10-10 03:21:36 +02008919 result = any_find_slice(direction,
8920 str, sub, start, end
8921 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008922
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008924 Py_DECREF(sub);
8925
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926 return result;
8927}
8928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929Py_ssize_t
8930PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8931 Py_ssize_t start, Py_ssize_t end,
8932 int direction)
8933{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008935 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 if (PyUnicode_READY(str) == -1)
8937 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008938 if (start < 0 || end < 0) {
8939 PyErr_SetString(PyExc_IndexError, "string index out of range");
8940 return -2;
8941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 if (end > PyUnicode_GET_LENGTH(str))
8943 end = PyUnicode_GET_LENGTH(str);
8944 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008945 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8946 kind, end-start, ch, direction);
8947 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008949 else
8950 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951}
8952
Alexander Belopolsky40018472011-02-26 01:02:56 +00008953static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008954tailmatch(PyObject *self,
8955 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008956 Py_ssize_t start,
8957 Py_ssize_t end,
8958 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 int kind_self;
8961 int kind_sub;
8962 void *data_self;
8963 void *data_sub;
8964 Py_ssize_t offset;
8965 Py_ssize_t i;
8966 Py_ssize_t end_sub;
8967
8968 if (PyUnicode_READY(self) == -1 ||
8969 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01008970 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971
8972 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 return 1;
8974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8976 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 kind_self = PyUnicode_KIND(self);
8981 data_self = PyUnicode_DATA(self);
8982 kind_sub = PyUnicode_KIND(substring);
8983 data_sub = PyUnicode_DATA(substring);
8984 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8985
8986 if (direction > 0)
8987 offset = end;
8988 else
8989 offset = start;
8990
8991 if (PyUnicode_READ(kind_self, data_self, offset) ==
8992 PyUnicode_READ(kind_sub, data_sub, 0) &&
8993 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8994 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8995 /* If both are of the same kind, memcmp is sufficient */
8996 if (kind_self == kind_sub) {
8997 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008998 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 data_sub,
9000 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009001 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 }
9003 /* otherwise we have to compare each character by first accesing it */
9004 else {
9005 /* We do not need to compare 0 and len(substring)-1 because
9006 the if statement above ensured already that they are equal
9007 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 for (i = 1; i < end_sub; ++i) {
9009 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9010 PyUnicode_READ(kind_sub, data_sub, i))
9011 return 0;
9012 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 }
9016
9017 return 0;
9018}
9019
Alexander Belopolsky40018472011-02-26 01:02:56 +00009020Py_ssize_t
9021PyUnicode_Tailmatch(PyObject *str,
9022 PyObject *substr,
9023 Py_ssize_t start,
9024 Py_ssize_t end,
9025 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009027 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009028
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029 str = PyUnicode_FromObject(str);
9030 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 substr = PyUnicode_FromObject(substr);
9033 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 Py_DECREF(str);
9035 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 }
Tim Petersced69f82003-09-16 20:30:58 +00009037
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009038 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009039 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 Py_DECREF(str);
9041 Py_DECREF(substr);
9042 return result;
9043}
9044
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045/* Apply fixfct filter to the Unicode object self and return a
9046 reference to the modified object */
9047
Alexander Belopolsky40018472011-02-26 01:02:56 +00009048static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009049fixup(PyObject *self,
9050 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 PyObject *u;
9053 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009054 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009056 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009058 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009059 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 /* fix functions return the new maximum character in a string,
9062 if the kind of the resulting unicode object does not change,
9063 everything is fine. Otherwise we need to change the string kind
9064 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009065 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009066
9067 if (maxchar_new == 0) {
9068 /* no changes */;
9069 if (PyUnicode_CheckExact(self)) {
9070 Py_DECREF(u);
9071 Py_INCREF(self);
9072 return self;
9073 }
9074 else
9075 return u;
9076 }
9077
Victor Stinnere6abb482012-05-02 01:15:40 +02009078 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079
Victor Stinnereaab6042011-12-11 22:22:39 +01009080 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009082
9083 /* In case the maximum character changed, we need to
9084 convert the string to the new category. */
9085 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9086 if (v == NULL) {
9087 Py_DECREF(u);
9088 return NULL;
9089 }
9090 if (maxchar_new > maxchar_old) {
9091 /* If the maxchar increased so that the kind changed, not all
9092 characters are representable anymore and we need to fix the
9093 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009094 _PyUnicode_FastCopyCharacters(v, 0,
9095 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009096 maxchar_old = fixfct(v);
9097 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 }
9099 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009100 _PyUnicode_FastCopyCharacters(v, 0,
9101 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009103 Py_DECREF(u);
9104 assert(_PyUnicode_CheckConsistency(v, 1));
9105 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106}
9107
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009108static PyObject *
9109ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009111 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9112 char *resdata, *data = PyUnicode_DATA(self);
9113 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009114
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009115 res = PyUnicode_New(len, 127);
9116 if (res == NULL)
9117 return NULL;
9118 resdata = PyUnicode_DATA(res);
9119 if (lower)
9120 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009122 _Py_bytes_upper(resdata, data, len);
9123 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124}
9125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009127handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009129 Py_ssize_t j;
9130 int final_sigma;
9131 Py_UCS4 c;
9132 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009133
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009134 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9135
9136 where ! is a negation and \p{xxx} is a character with property xxx.
9137 */
9138 for (j = i - 1; j >= 0; j--) {
9139 c = PyUnicode_READ(kind, data, j);
9140 if (!_PyUnicode_IsCaseIgnorable(c))
9141 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009143 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9144 if (final_sigma) {
9145 for (j = i + 1; j < length; j++) {
9146 c = PyUnicode_READ(kind, data, j);
9147 if (!_PyUnicode_IsCaseIgnorable(c))
9148 break;
9149 }
9150 final_sigma = j == length || !_PyUnicode_IsCased(c);
9151 }
9152 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153}
9154
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009155static int
9156lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9157 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009159 /* Obscure special case. */
9160 if (c == 0x3A3) {
9161 mapped[0] = handle_capital_sigma(kind, data, length, i);
9162 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009164 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165}
9166
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009167static Py_ssize_t
9168do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009170 Py_ssize_t i, k = 0;
9171 int n_res, j;
9172 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009173
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009174 c = PyUnicode_READ(kind, data, 0);
9175 n_res = _PyUnicode_ToUpperFull(c, mapped);
9176 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009177 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009178 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009180 for (i = 1; i < length; i++) {
9181 c = PyUnicode_READ(kind, data, i);
9182 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9183 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009184 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009185 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009186 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009187 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009188 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189}
9190
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009191static Py_ssize_t
9192do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9193 Py_ssize_t i, k = 0;
9194
9195 for (i = 0; i < length; i++) {
9196 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9197 int n_res, j;
9198 if (Py_UNICODE_ISUPPER(c)) {
9199 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9200 }
9201 else if (Py_UNICODE_ISLOWER(c)) {
9202 n_res = _PyUnicode_ToUpperFull(c, mapped);
9203 }
9204 else {
9205 n_res = 1;
9206 mapped[0] = c;
9207 }
9208 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009209 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009210 res[k++] = mapped[j];
9211 }
9212 }
9213 return k;
9214}
9215
9216static Py_ssize_t
9217do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9218 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009220 Py_ssize_t i, k = 0;
9221
9222 for (i = 0; i < length; i++) {
9223 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9224 int n_res, j;
9225 if (lower)
9226 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9227 else
9228 n_res = _PyUnicode_ToUpperFull(c, mapped);
9229 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009230 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009231 res[k++] = mapped[j];
9232 }
9233 }
9234 return k;
9235}
9236
9237static Py_ssize_t
9238do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9239{
9240 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9241}
9242
9243static Py_ssize_t
9244do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9245{
9246 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9247}
9248
Benjamin Petersone51757f2012-01-12 21:10:29 -05009249static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009250do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9251{
9252 Py_ssize_t i, k = 0;
9253
9254 for (i = 0; i < length; i++) {
9255 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9256 Py_UCS4 mapped[3];
9257 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9258 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009259 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009260 res[k++] = mapped[j];
9261 }
9262 }
9263 return k;
9264}
9265
9266static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009267do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9268{
9269 Py_ssize_t i, k = 0;
9270 int previous_is_cased;
9271
9272 previous_is_cased = 0;
9273 for (i = 0; i < length; i++) {
9274 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9275 Py_UCS4 mapped[3];
9276 int n_res, j;
9277
9278 if (previous_is_cased)
9279 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9280 else
9281 n_res = _PyUnicode_ToTitleFull(c, mapped);
9282
9283 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009284 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009285 res[k++] = mapped[j];
9286 }
9287
9288 previous_is_cased = _PyUnicode_IsCased(c);
9289 }
9290 return k;
9291}
9292
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009293static PyObject *
9294case_operation(PyObject *self,
9295 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9296{
9297 PyObject *res = NULL;
9298 Py_ssize_t length, newlength = 0;
9299 int kind, outkind;
9300 void *data, *outdata;
9301 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9302
Benjamin Petersoneea48462012-01-16 14:28:50 -05009303 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009304
9305 kind = PyUnicode_KIND(self);
9306 data = PyUnicode_DATA(self);
9307 length = PyUnicode_GET_LENGTH(self);
9308 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9309 if (tmp == NULL)
9310 return PyErr_NoMemory();
9311 newlength = perform(kind, data, length, tmp, &maxchar);
9312 res = PyUnicode_New(newlength, maxchar);
9313 if (res == NULL)
9314 goto leave;
9315 tmpend = tmp + newlength;
9316 outdata = PyUnicode_DATA(res);
9317 outkind = PyUnicode_KIND(res);
9318 switch (outkind) {
9319 case PyUnicode_1BYTE_KIND:
9320 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9321 break;
9322 case PyUnicode_2BYTE_KIND:
9323 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9324 break;
9325 case PyUnicode_4BYTE_KIND:
9326 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9327 break;
9328 default:
9329 assert(0);
9330 break;
9331 }
9332 leave:
9333 PyMem_FREE(tmp);
9334 return res;
9335}
9336
Tim Peters8ce9f162004-08-27 01:49:32 +00009337PyObject *
9338PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009341 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009343 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009344 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9345 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009346 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009348 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009350 int use_memcpy;
9351 unsigned char *res_data = NULL, *sep_data = NULL;
9352 PyObject *last_obj;
9353 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354
Tim Peters05eba1f2004-08-27 21:32:02 +00009355 fseq = PySequence_Fast(seq, "");
9356 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009357 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009358 }
9359
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009360 /* NOTE: the following code can't call back into Python code,
9361 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009362 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009363
Tim Peters05eba1f2004-08-27 21:32:02 +00009364 seqlen = PySequence_Fast_GET_SIZE(fseq);
9365 /* If empty sequence, return u"". */
9366 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009367 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009368 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009370
Tim Peters05eba1f2004-08-27 21:32:02 +00009371 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009372 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009373 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009374 if (seqlen == 1) {
9375 if (PyUnicode_CheckExact(items[0])) {
9376 res = items[0];
9377 Py_INCREF(res);
9378 Py_DECREF(fseq);
9379 return res;
9380 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009381 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009382 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009383 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009384 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009385 /* Set up sep and seplen */
9386 if (separator == NULL) {
9387 /* fall back to a blank space separator */
9388 sep = PyUnicode_FromOrdinal(' ');
9389 if (!sep)
9390 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009391 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009392 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009393 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009394 else {
9395 if (!PyUnicode_Check(separator)) {
9396 PyErr_Format(PyExc_TypeError,
9397 "separator: expected str instance,"
9398 " %.80s found",
9399 Py_TYPE(separator)->tp_name);
9400 goto onError;
9401 }
9402 if (PyUnicode_READY(separator))
9403 goto onError;
9404 sep = separator;
9405 seplen = PyUnicode_GET_LENGTH(separator);
9406 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9407 /* inc refcount to keep this code path symmetric with the
9408 above case of a blank separator */
9409 Py_INCREF(sep);
9410 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009411 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009412 }
9413
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009414 /* There are at least two things to join, or else we have a subclass
9415 * of str in the sequence.
9416 * Do a pre-pass to figure out the total amount of space we'll
9417 * need (sz), and see whether all argument are strings.
9418 */
9419 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009420#ifdef Py_DEBUG
9421 use_memcpy = 0;
9422#else
9423 use_memcpy = 1;
9424#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009425 for (i = 0; i < seqlen; i++) {
9426 const Py_ssize_t old_sz = sz;
9427 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009428 if (!PyUnicode_Check(item)) {
9429 PyErr_Format(PyExc_TypeError,
9430 "sequence item %zd: expected str instance,"
9431 " %.80s found",
9432 i, Py_TYPE(item)->tp_name);
9433 goto onError;
9434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 if (PyUnicode_READY(item) == -1)
9436 goto onError;
9437 sz += PyUnicode_GET_LENGTH(item);
9438 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009439 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009440 if (i != 0)
9441 sz += seplen;
9442 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9443 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009445 goto onError;
9446 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009447 if (use_memcpy && last_obj != NULL) {
9448 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9449 use_memcpy = 0;
9450 }
9451 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009452 }
Tim Petersced69f82003-09-16 20:30:58 +00009453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009455 if (res == NULL)
9456 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009457
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009458 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009459#ifdef Py_DEBUG
9460 use_memcpy = 0;
9461#else
9462 if (use_memcpy) {
9463 res_data = PyUnicode_1BYTE_DATA(res);
9464 kind = PyUnicode_KIND(res);
9465 if (seplen != 0)
9466 sep_data = PyUnicode_1BYTE_DATA(sep);
9467 }
9468#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009469 if (use_memcpy) {
9470 for (i = 0; i < seqlen; ++i) {
9471 Py_ssize_t itemlen;
9472 item = items[i];
9473
9474 /* Copy item, and maybe the separator. */
9475 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009476 Py_MEMCPY(res_data,
9477 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009478 kind * seplen);
9479 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009480 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009481
9482 itemlen = PyUnicode_GET_LENGTH(item);
9483 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009484 Py_MEMCPY(res_data,
9485 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009486 kind * itemlen);
9487 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009488 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009489 }
9490 assert(res_data == PyUnicode_1BYTE_DATA(res)
9491 + kind * PyUnicode_GET_LENGTH(res));
9492 }
9493 else {
9494 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9495 Py_ssize_t itemlen;
9496 item = items[i];
9497
9498 /* Copy item, and maybe the separator. */
9499 if (i && seplen != 0) {
9500 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9501 res_offset += seplen;
9502 }
9503
9504 itemlen = PyUnicode_GET_LENGTH(item);
9505 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009506 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009507 res_offset += itemlen;
9508 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009509 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009510 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009511 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009512
Tim Peters05eba1f2004-08-27 21:32:02 +00009513 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009515 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009519 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009521 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 return NULL;
9523}
9524
/* Store `value` into `length` consecutive characters of the character
   buffer `data` (of storage kind `kind`), starting at character index
   `start`.  `kind` must be one of the 1-, 2- or 4-byte kinds.
   The arguments may be evaluated more than once (`length` and `value`
   once per character in the 2- and 4-byte cases), so pass expressions
   without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9548
Victor Stinnerd3f08822012-05-29 12:57:52 +02009549void
9550_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9551 Py_UCS4 fill_char)
9552{
9553 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9554 const void *data = PyUnicode_DATA(unicode);
9555 assert(PyUnicode_IS_READY(unicode));
9556 assert(unicode_modifiable(unicode));
9557 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9558 assert(start >= 0);
9559 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9560 FILL(kind, data, fill_char, start, length);
9561}
9562
Victor Stinner3fe55312012-01-04 00:33:50 +01009563Py_ssize_t
9564PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9565 Py_UCS4 fill_char)
9566{
9567 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009568
9569 if (!PyUnicode_Check(unicode)) {
9570 PyErr_BadInternalCall();
9571 return -1;
9572 }
9573 if (PyUnicode_READY(unicode) == -1)
9574 return -1;
9575 if (unicode_check_modifiable(unicode))
9576 return -1;
9577
Victor Stinnerd3f08822012-05-29 12:57:52 +02009578 if (start < 0) {
9579 PyErr_SetString(PyExc_IndexError, "string index out of range");
9580 return -1;
9581 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009582 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9583 PyErr_SetString(PyExc_ValueError,
9584 "fill character is bigger than "
9585 "the string maximum character");
9586 return -1;
9587 }
9588
9589 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9590 length = Py_MIN(maxlen, length);
9591 if (length <= 0)
9592 return 0;
9593
Victor Stinnerd3f08822012-05-29 12:57:52 +02009594 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009595 return length;
9596}
9597
Victor Stinner9310abb2011-10-05 00:59:23 +02009598static PyObject *
9599pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009600 Py_ssize_t left,
9601 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 PyObject *u;
9605 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009606 int kind;
9607 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608
9609 if (left < 0)
9610 left = 0;
9611 if (right < 0)
9612 right = 0;
9613
Victor Stinnerc4b49542011-12-11 22:44:26 +01009614 if (left == 0 && right == 0)
9615 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9618 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009619 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9620 return NULL;
9621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009623 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009625 if (!u)
9626 return NULL;
9627
9628 kind = PyUnicode_KIND(u);
9629 data = PyUnicode_DATA(u);
9630 if (left)
9631 FILL(kind, data, fill, 0, left);
9632 if (right)
9633 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009634 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009635 assert(_PyUnicode_CheckConsistency(u, 1));
9636 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637}
9638
Alexander Belopolsky40018472011-02-26 01:02:56 +00009639PyObject *
9640PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643
9644 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009645 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009646 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009647 if (PyUnicode_READY(string) == -1) {
9648 Py_DECREF(string);
9649 return NULL;
9650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651
Benjamin Petersonead6b532011-12-20 17:23:42 -06009652 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009654 if (PyUnicode_IS_ASCII(string))
9655 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009656 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009657 PyUnicode_GET_LENGTH(string), keepends);
9658 else
9659 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009660 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009661 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 break;
9663 case PyUnicode_2BYTE_KIND:
9664 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009665 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 PyUnicode_GET_LENGTH(string), keepends);
9667 break;
9668 case PyUnicode_4BYTE_KIND:
9669 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009670 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 PyUnicode_GET_LENGTH(string), keepends);
9672 break;
9673 default:
9674 assert(0);
9675 list = 0;
9676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677 Py_DECREF(string);
9678 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679}
9680
Alexander Belopolsky40018472011-02-26 01:02:56 +00009681static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009682split(PyObject *self,
9683 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009684 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 int kind1, kind2, kind;
9687 void *buf1, *buf2;
9688 Py_ssize_t len1, len2;
9689 PyObject* out;
9690
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009692 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 if (PyUnicode_READY(self) == -1)
9695 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009698 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009700 if (PyUnicode_IS_ASCII(self))
9701 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009702 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009703 PyUnicode_GET_LENGTH(self), maxcount
9704 );
9705 else
9706 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009707 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009708 PyUnicode_GET_LENGTH(self), maxcount
9709 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 case PyUnicode_2BYTE_KIND:
9711 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009712 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 PyUnicode_GET_LENGTH(self), maxcount
9714 );
9715 case PyUnicode_4BYTE_KIND:
9716 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009717 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 PyUnicode_GET_LENGTH(self), maxcount
9719 );
9720 default:
9721 assert(0);
9722 return NULL;
9723 }
9724
9725 if (PyUnicode_READY(substring) == -1)
9726 return NULL;
9727
9728 kind1 = PyUnicode_KIND(self);
9729 kind2 = PyUnicode_KIND(substring);
9730 kind = kind1 > kind2 ? kind1 : kind2;
9731 buf1 = PyUnicode_DATA(self);
9732 buf2 = PyUnicode_DATA(substring);
9733 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009734 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 if (!buf1)
9736 return NULL;
9737 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009738 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 if (!buf2) {
9740 if (kind1 != kind) PyMem_Free(buf1);
9741 return NULL;
9742 }
9743 len1 = PyUnicode_GET_LENGTH(self);
9744 len2 = PyUnicode_GET_LENGTH(substring);
9745
Benjamin Petersonead6b532011-12-20 17:23:42 -06009746 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009748 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9749 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009750 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009751 else
9752 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009753 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 break;
9755 case PyUnicode_2BYTE_KIND:
9756 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009757 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 break;
9759 case PyUnicode_4BYTE_KIND:
9760 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009761 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 break;
9763 default:
9764 out = NULL;
9765 }
9766 if (kind1 != kind)
9767 PyMem_Free(buf1);
9768 if (kind2 != kind)
9769 PyMem_Free(buf2);
9770 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771}
9772
Alexander Belopolsky40018472011-02-26 01:02:56 +00009773static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009774rsplit(PyObject *self,
9775 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009776 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 int kind1, kind2, kind;
9779 void *buf1, *buf2;
9780 Py_ssize_t len1, len2;
9781 PyObject* out;
9782
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009783 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009784 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 if (PyUnicode_READY(self) == -1)
9787 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009790 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009792 if (PyUnicode_IS_ASCII(self))
9793 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009794 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009795 PyUnicode_GET_LENGTH(self), maxcount
9796 );
9797 else
9798 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009799 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009800 PyUnicode_GET_LENGTH(self), maxcount
9801 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 case PyUnicode_2BYTE_KIND:
9803 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009804 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 PyUnicode_GET_LENGTH(self), maxcount
9806 );
9807 case PyUnicode_4BYTE_KIND:
9808 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009809 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 PyUnicode_GET_LENGTH(self), maxcount
9811 );
9812 default:
9813 assert(0);
9814 return NULL;
9815 }
9816
9817 if (PyUnicode_READY(substring) == -1)
9818 return NULL;
9819
9820 kind1 = PyUnicode_KIND(self);
9821 kind2 = PyUnicode_KIND(substring);
9822 kind = kind1 > kind2 ? kind1 : kind2;
9823 buf1 = PyUnicode_DATA(self);
9824 buf2 = PyUnicode_DATA(substring);
9825 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009826 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 if (!buf1)
9828 return NULL;
9829 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 if (!buf2) {
9832 if (kind1 != kind) PyMem_Free(buf1);
9833 return NULL;
9834 }
9835 len1 = PyUnicode_GET_LENGTH(self);
9836 len2 = PyUnicode_GET_LENGTH(substring);
9837
Benjamin Petersonead6b532011-12-20 17:23:42 -06009838 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009840 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9841 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009842 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009843 else
9844 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009845 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 break;
9847 case PyUnicode_2BYTE_KIND:
9848 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009849 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 break;
9851 case PyUnicode_4BYTE_KIND:
9852 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009853 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 break;
9855 default:
9856 out = NULL;
9857 }
9858 if (kind1 != kind)
9859 PyMem_Free(buf1);
9860 if (kind2 != kind)
9861 PyMem_Free(buf2);
9862 return out;
9863}
9864
9865static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009866anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9867 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009869 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009871 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9872 return asciilib_find(buf1, len1, buf2, len2, offset);
9873 else
9874 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 case PyUnicode_2BYTE_KIND:
9876 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9877 case PyUnicode_4BYTE_KIND:
9878 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9879 }
9880 assert(0);
9881 return -1;
9882}
9883
9884static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009885anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9886 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009888 switch (kind) {
9889 case PyUnicode_1BYTE_KIND:
9890 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9891 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9892 else
9893 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9894 case PyUnicode_2BYTE_KIND:
9895 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9896 case PyUnicode_4BYTE_KIND:
9897 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9898 }
9899 assert(0);
9900 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009901}
9902
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009903static void
9904replace_1char_inplace(PyObject *u, Py_ssize_t pos,
9905 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
9906{
9907 int kind = PyUnicode_KIND(u);
9908 void *data = PyUnicode_DATA(u);
9909 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
9910 if (kind == PyUnicode_1BYTE_KIND) {
9911 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
9912 (Py_UCS1 *)data + len,
9913 u1, u2, maxcount);
9914 }
9915 else if (kind == PyUnicode_2BYTE_KIND) {
9916 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
9917 (Py_UCS2 *)data + len,
9918 u1, u2, maxcount);
9919 }
9920 else {
9921 assert(kind == PyUnicode_4BYTE_KIND);
9922 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
9923 (Py_UCS4 *)data + len,
9924 u1, u2, maxcount);
9925 }
9926}
9927
Alexander Belopolsky40018472011-02-26 01:02:56 +00009928static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929replace(PyObject *self, PyObject *str1,
9930 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 PyObject *u;
9933 char *sbuf = PyUnicode_DATA(self);
9934 char *buf1 = PyUnicode_DATA(str1);
9935 char *buf2 = PyUnicode_DATA(str2);
9936 int srelease = 0, release1 = 0, release2 = 0;
9937 int skind = PyUnicode_KIND(self);
9938 int kind1 = PyUnicode_KIND(str1);
9939 int kind2 = PyUnicode_KIND(str2);
9940 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9941 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9942 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009943 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009944 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945
9946 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009949 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950
Victor Stinner59de0ee2011-10-07 10:01:28 +02009951 if (str1 == str2)
9952 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953
Victor Stinner49a0a212011-10-12 23:46:10 +02009954 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009955 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
9956 if (maxchar < maxchar_str1)
9957 /* substring too wide to be present */
9958 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +02009959 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9960 /* Replacing str1 with str2 may cause a maxchar reduction in the
9961 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009962 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Victor Stinnere6abb482012-05-02 01:15:40 +02009963 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009966 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009968 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009969 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009970 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009971 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009972 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +01009973
Victor Stinner69ed0f42013-04-09 21:48:24 +02009974 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009975 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +01009976 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009977 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +02009978 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009980 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +01009982
Serhiy Storchakae2cef882013-04-13 22:45:04 +03009983 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
9984 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +02009985 }
9986 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 int rkind = skind;
9988 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009989 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 if (kind1 < rkind) {
9992 /* widen substring */
9993 buf1 = _PyUnicode_AsKind(str1, rkind);
9994 if (!buf1) goto error;
9995 release1 = 1;
9996 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009997 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009998 if (i < 0)
9999 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 if (rkind > kind2) {
10001 /* widen replacement */
10002 buf2 = _PyUnicode_AsKind(str2, rkind);
10003 if (!buf2) goto error;
10004 release2 = 1;
10005 }
10006 else if (rkind < kind2) {
10007 /* widen self and buf1 */
10008 rkind = kind2;
10009 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010010 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 sbuf = _PyUnicode_AsKind(self, rkind);
10012 if (!sbuf) goto error;
10013 srelease = 1;
10014 buf1 = _PyUnicode_AsKind(str1, rkind);
10015 if (!buf1) goto error;
10016 release1 = 1;
10017 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010018 u = PyUnicode_New(slen, maxchar);
10019 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010021 assert(PyUnicode_KIND(u) == rkind);
10022 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010023
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010024 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010025 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010026 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010028 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010030
10031 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010032 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010033 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010034 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010035 if (i == -1)
10036 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010037 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010039 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010043 }
10044 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010046 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 int rkind = skind;
10048 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010051 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 buf1 = _PyUnicode_AsKind(str1, rkind);
10053 if (!buf1) goto error;
10054 release1 = 1;
10055 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010057 if (n == 0)
10058 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010060 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 buf2 = _PyUnicode_AsKind(str2, rkind);
10062 if (!buf2) goto error;
10063 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010066 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 rkind = kind2;
10068 sbuf = _PyUnicode_AsKind(self, rkind);
10069 if (!sbuf) goto error;
10070 srelease = 1;
10071 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010072 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 buf1 = _PyUnicode_AsKind(str1, rkind);
10074 if (!buf1) goto error;
10075 release1 = 1;
10076 }
10077 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10078 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010079 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 PyErr_SetString(PyExc_OverflowError,
10081 "replace string is too long");
10082 goto error;
10083 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010084 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010085 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010086 _Py_INCREF_UNICODE_EMPTY();
10087 if (!unicode_empty)
10088 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010089 u = unicode_empty;
10090 goto done;
10091 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010092 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 PyErr_SetString(PyExc_OverflowError,
10094 "replace string is too long");
10095 goto error;
10096 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010097 u = PyUnicode_New(new_size, maxchar);
10098 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010100 assert(PyUnicode_KIND(u) == rkind);
10101 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 ires = i = 0;
10103 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010104 while (n-- > 0) {
10105 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010106 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010107 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010108 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010109 if (j == -1)
10110 break;
10111 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010112 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010113 memcpy(res + rkind * ires,
10114 sbuf + rkind * i,
10115 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010117 }
10118 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010120 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010122 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010128 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010129 memcpy(res + rkind * ires,
10130 sbuf + rkind * i,
10131 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010132 }
10133 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010134 /* interleave */
10135 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010136 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010138 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010140 if (--n <= 0)
10141 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010142 memcpy(res + rkind * ires,
10143 sbuf + rkind * i,
10144 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 ires++;
10146 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010147 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010148 memcpy(res + rkind * ires,
10149 sbuf + rkind * i,
10150 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010151 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010152 }
10153
10154 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010155 unicode_adjust_maxchar(&u);
10156 if (u == NULL)
10157 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010159
10160 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 if (srelease)
10162 PyMem_FREE(sbuf);
10163 if (release1)
10164 PyMem_FREE(buf1);
10165 if (release2)
10166 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010167 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010169
Benjamin Peterson29060642009-01-31 22:14:21 +000010170 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010171 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (srelease)
10173 PyMem_FREE(sbuf);
10174 if (release1)
10175 PyMem_FREE(buf1);
10176 if (release2)
10177 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010178 return unicode_result_unchanged(self);
10179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 error:
10181 if (srelease && sbuf)
10182 PyMem_FREE(sbuf);
10183 if (release1 && buf1)
10184 PyMem_FREE(buf1);
10185 if (release2 && buf2)
10186 PyMem_FREE(buf2);
10187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188}
10189
10190/* --- Unicode Object Methods --------------------------------------------- */
10191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010192PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010193 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194\n\
10195Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010196characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197
10198static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010199unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010201 if (PyUnicode_READY(self) == -1)
10202 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010203 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204}
10205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010206PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010207 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208\n\
10209Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010210have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211
10212static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010213unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010215 if (PyUnicode_READY(self) == -1)
10216 return NULL;
10217 if (PyUnicode_GET_LENGTH(self) == 0)
10218 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010219 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220}
10221
Benjamin Petersond5890c82012-01-14 13:23:30 -050010222PyDoc_STRVAR(casefold__doc__,
10223 "S.casefold() -> str\n\
10224\n\
10225Return a version of S suitable for caseless comparisons.");
10226
10227static PyObject *
10228unicode_casefold(PyObject *self)
10229{
10230 if (PyUnicode_READY(self) == -1)
10231 return NULL;
10232 if (PyUnicode_IS_ASCII(self))
10233 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010234 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010235}
10236
10237
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010238/* Argument converter. Coerces to a single unicode character */
10239
10240static int
10241convert_uc(PyObject *obj, void *addr)
10242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010244 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010245
Benjamin Peterson14339b62009-01-31 16:36:08 +000010246 uniobj = PyUnicode_FromObject(obj);
10247 if (uniobj == NULL) {
10248 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010249 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010250 return 0;
10251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010253 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010254 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010255 Py_DECREF(uniobj);
10256 return 0;
10257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010259 Py_DECREF(uniobj);
10260 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010261}
10262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010263PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010264 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010266Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010267done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268
10269static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010270unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010272 Py_ssize_t marg, left;
10273 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 Py_UCS4 fillchar = ' ';
10275
Victor Stinnere9a29352011-10-01 02:14:59 +020010276 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278
Benjamin Petersonbac79492012-01-14 13:34:47 -050010279 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280 return NULL;
10281
Victor Stinnerc4b49542011-12-11 22:44:26 +010010282 if (PyUnicode_GET_LENGTH(self) >= width)
10283 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
Victor Stinnerc4b49542011-12-11 22:44:26 +010010285 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286 left = marg / 2 + (marg & width & 1);
10287
Victor Stinner9310abb2011-10-05 00:59:23 +020010288 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289}
10290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291/* This function assumes that str1 and str2 are readied by the caller. */
10292
Marc-André Lemburge5034372000-08-08 08:04:29 +000010293static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010294unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010295{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010296#define COMPARE(TYPE1, TYPE2) \
10297 do { \
10298 TYPE1* p1 = (TYPE1 *)data1; \
10299 TYPE2* p2 = (TYPE2 *)data2; \
10300 TYPE1* end = p1 + len; \
10301 Py_UCS4 c1, c2; \
10302 for (; p1 != end; p1++, p2++) { \
10303 c1 = *p1; \
10304 c2 = *p2; \
10305 if (c1 != c2) \
10306 return (c1 < c2) ? -1 : 1; \
10307 } \
10308 } \
10309 while (0)
10310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 int kind1, kind2;
10312 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010313 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010314
Victor Stinner90db9c42012-10-04 21:53:50 +020010315 /* a string is equal to itself */
10316 if (str1 == str2)
10317 return 0;
10318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 kind1 = PyUnicode_KIND(str1);
10320 kind2 = PyUnicode_KIND(str2);
10321 data1 = PyUnicode_DATA(str1);
10322 data2 = PyUnicode_DATA(str2);
10323 len1 = PyUnicode_GET_LENGTH(str1);
10324 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010325 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010326
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010327 switch(kind1) {
10328 case PyUnicode_1BYTE_KIND:
10329 {
10330 switch(kind2) {
10331 case PyUnicode_1BYTE_KIND:
10332 {
10333 int cmp = memcmp(data1, data2, len);
10334 /* normalize result of memcmp() into the range [-1; 1] */
10335 if (cmp < 0)
10336 return -1;
10337 if (cmp > 0)
10338 return 1;
10339 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010340 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010341 case PyUnicode_2BYTE_KIND:
10342 COMPARE(Py_UCS1, Py_UCS2);
10343 break;
10344 case PyUnicode_4BYTE_KIND:
10345 COMPARE(Py_UCS1, Py_UCS4);
10346 break;
10347 default:
10348 assert(0);
10349 }
10350 break;
10351 }
10352 case PyUnicode_2BYTE_KIND:
10353 {
10354 switch(kind2) {
10355 case PyUnicode_1BYTE_KIND:
10356 COMPARE(Py_UCS2, Py_UCS1);
10357 break;
10358 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010359 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010360 COMPARE(Py_UCS2, Py_UCS2);
10361 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010362 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010363 case PyUnicode_4BYTE_KIND:
10364 COMPARE(Py_UCS2, Py_UCS4);
10365 break;
10366 default:
10367 assert(0);
10368 }
10369 break;
10370 }
10371 case PyUnicode_4BYTE_KIND:
10372 {
10373 switch(kind2) {
10374 case PyUnicode_1BYTE_KIND:
10375 COMPARE(Py_UCS4, Py_UCS1);
10376 break;
10377 case PyUnicode_2BYTE_KIND:
10378 COMPARE(Py_UCS4, Py_UCS2);
10379 break;
10380 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010381 {
10382#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10383 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10384 /* normalize result of wmemcmp() into the range [-1; 1] */
10385 if (cmp < 0)
10386 return -1;
10387 if (cmp > 0)
10388 return 1;
10389#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010390 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010391#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010392 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010393 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010394 default:
10395 assert(0);
10396 }
10397 break;
10398 }
10399 default:
10400 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010401 }
10402
Victor Stinner770e19e2012-10-04 22:59:45 +020010403 if (len1 == len2)
10404 return 0;
10405 if (len1 < len2)
10406 return -1;
10407 else
10408 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010409
10410#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010411}
10412
Victor Stinnere5567ad2012-10-23 02:48:49 +020010413static int
10414unicode_compare_eq(PyObject *str1, PyObject *str2)
10415{
10416 int kind;
10417 void *data1, *data2;
10418 Py_ssize_t len;
10419 int cmp;
10420
10421 /* a string is equal to itself */
10422 if (str1 == str2)
10423 return 1;
10424
10425 len = PyUnicode_GET_LENGTH(str1);
10426 if (PyUnicode_GET_LENGTH(str2) != len)
10427 return 0;
10428 kind = PyUnicode_KIND(str1);
10429 if (PyUnicode_KIND(str2) != kind)
10430 return 0;
10431 data1 = PyUnicode_DATA(str1);
10432 data2 = PyUnicode_DATA(str2);
10433
10434 cmp = memcmp(data1, data2, len * kind);
10435 return (cmp == 0);
10436}
10437
10438
Alexander Belopolsky40018472011-02-26 01:02:56 +000010439int
10440PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10443 if (PyUnicode_READY(left) == -1 ||
10444 PyUnicode_READY(right) == -1)
10445 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010446 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010448 PyErr_Format(PyExc_TypeError,
10449 "Can't compare %.100s and %.100s",
10450 left->ob_type->tp_name,
10451 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452 return -1;
10453}
10454
Martin v. Löwis5b222132007-06-10 09:51:05 +000010455int
10456PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 Py_ssize_t i;
10459 int kind;
10460 void *data;
10461 Py_UCS4 chr;
10462
Victor Stinner910337b2011-10-03 03:20:16 +020010463 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (PyUnicode_READY(uni) == -1)
10465 return -1;
10466 kind = PyUnicode_KIND(uni);
10467 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010468 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10470 if (chr != str[i])
10471 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010472 /* This check keeps Python strings that end in '\0' from comparing equal
10473 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010475 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010476 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010477 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010478 return 0;
10479}
10480
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010481
Benjamin Peterson29060642009-01-31 22:14:21 +000010482#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010483 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010484
Alexander Belopolsky40018472011-02-26 01:02:56 +000010485PyObject *
10486PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010487{
10488 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010489 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010490
Victor Stinnere5567ad2012-10-23 02:48:49 +020010491 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10492 Py_RETURN_NOTIMPLEMENTED;
10493
10494 if (PyUnicode_READY(left) == -1 ||
10495 PyUnicode_READY(right) == -1)
10496 return NULL;
10497
10498 if (op == Py_EQ || op == Py_NE) {
10499 result = unicode_compare_eq(left, right);
10500 if (op == Py_EQ)
10501 v = TEST_COND(result);
10502 else
10503 v = TEST_COND(!result);
10504 }
10505 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010506 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010507
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010508 /* Convert the return value to a Boolean */
10509 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010510 case Py_LE:
10511 v = TEST_COND(result <= 0);
10512 break;
10513 case Py_GE:
10514 v = TEST_COND(result >= 0);
10515 break;
10516 case Py_LT:
10517 v = TEST_COND(result == -1);
10518 break;
10519 case Py_GT:
10520 v = TEST_COND(result == 1);
10521 break;
10522 default:
10523 PyErr_BadArgument();
10524 return NULL;
10525 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010526 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010527 Py_INCREF(v);
10528 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010529}
10530
Alexander Belopolsky40018472011-02-26 01:02:56 +000010531int
10532PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010533{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010534 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 int kind1, kind2, kind;
10536 void *buf1, *buf2;
10537 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010538 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010539
10540 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 sub = PyUnicode_FromObject(element);
10542 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 PyErr_Format(PyExc_TypeError,
10544 "'in <string>' requires string as left operand, not %s",
10545 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010546 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010547 }
10548
Thomas Wouters477c8d52006-05-27 19:21:47 +000010549 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010550 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010551 Py_DECREF(sub);
10552 return -1;
10553 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010554 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10555 Py_DECREF(sub);
10556 Py_DECREF(str);
10557 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 kind1 = PyUnicode_KIND(str);
10560 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010561 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 buf1 = PyUnicode_DATA(str);
10563 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010564 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010565 if (kind2 > kind) {
10566 Py_DECREF(sub);
10567 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010568 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010569 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010570 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 if (!buf2) {
10573 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010574 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 return -1;
10576 }
10577 len1 = PyUnicode_GET_LENGTH(str);
10578 len2 = PyUnicode_GET_LENGTH(sub);
10579
Benjamin Petersonead6b532011-12-20 17:23:42 -060010580 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 case PyUnicode_1BYTE_KIND:
10582 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10583 break;
10584 case PyUnicode_2BYTE_KIND:
10585 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10586 break;
10587 case PyUnicode_4BYTE_KIND:
10588 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10589 break;
10590 default:
10591 result = -1;
10592 assert(0);
10593 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010594
10595 Py_DECREF(str);
10596 Py_DECREF(sub);
10597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 if (kind2 != kind)
10599 PyMem_Free(buf2);
10600
Guido van Rossum403d68b2000-03-13 15:55:09 +000010601 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010602}
10603
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604/* Concat to string or Unicode object giving a new Unicode object. */
10605
Alexander Belopolsky40018472011-02-26 01:02:56 +000010606PyObject *
10607PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010610 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010611 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612
10613 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010616 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010619 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620
10621 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010622 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010623 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010626 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010627 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 }
10630
Victor Stinner488fa492011-12-12 00:01:39 +010010631 u_len = PyUnicode_GET_LENGTH(u);
10632 v_len = PyUnicode_GET_LENGTH(v);
10633 if (u_len > PY_SSIZE_T_MAX - v_len) {
10634 PyErr_SetString(PyExc_OverflowError,
10635 "strings are too large to concat");
10636 goto onError;
10637 }
10638 new_len = u_len + v_len;
10639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010641 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010642 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010645 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010648 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10649 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010650 Py_DECREF(u);
10651 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010652 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654
Benjamin Peterson29060642009-01-31 22:14:21 +000010655 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010656 Py_XDECREF(u);
10657 Py_XDECREF(v);
10658 return NULL;
10659}
10660
Walter Dörwald1ab83302007-05-18 17:15:44 +000010661void
Victor Stinner23e56682011-10-03 03:54:37 +020010662PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010663{
Victor Stinner23e56682011-10-03 03:54:37 +020010664 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010665 Py_UCS4 maxchar, maxchar2;
10666 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010667
10668 if (p_left == NULL) {
10669 if (!PyErr_Occurred())
10670 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010671 return;
10672 }
Victor Stinner23e56682011-10-03 03:54:37 +020010673 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010674 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010675 if (!PyErr_Occurred())
10676 PyErr_BadInternalCall();
10677 goto error;
10678 }
10679
Benjamin Petersonbac79492012-01-14 13:34:47 -050010680 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010681 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010682 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010683 goto error;
10684
Victor Stinner488fa492011-12-12 00:01:39 +010010685 /* Shortcuts */
10686 if (left == unicode_empty) {
10687 Py_DECREF(left);
10688 Py_INCREF(right);
10689 *p_left = right;
10690 return;
10691 }
10692 if (right == unicode_empty)
10693 return;
10694
10695 left_len = PyUnicode_GET_LENGTH(left);
10696 right_len = PyUnicode_GET_LENGTH(right);
10697 if (left_len > PY_SSIZE_T_MAX - right_len) {
10698 PyErr_SetString(PyExc_OverflowError,
10699 "strings are too large to concat");
10700 goto error;
10701 }
10702 new_len = left_len + right_len;
10703
10704 if (unicode_modifiable(left)
10705 && PyUnicode_CheckExact(right)
10706 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010707 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10708 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010709 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010710 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010711 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10712 {
10713 /* append inplace */
10714 if (unicode_resize(p_left, new_len) != 0) {
10715 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10716 * deallocated so it cannot be put back into
10717 * 'variable'. The MemoryError is raised when there
10718 * is no value in 'variable', which might (very
10719 * remotely) be a cause of incompatibilities.
10720 */
10721 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010722 }
Victor Stinner488fa492011-12-12 00:01:39 +010010723 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010724 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010725 }
Victor Stinner488fa492011-12-12 00:01:39 +010010726 else {
10727 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10728 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010729 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010730
Victor Stinner488fa492011-12-12 00:01:39 +010010731 /* Concat the two Unicode strings */
10732 res = PyUnicode_New(new_len, maxchar);
10733 if (res == NULL)
10734 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010735 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10736 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010737 Py_DECREF(left);
10738 *p_left = res;
10739 }
10740 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010741 return;
10742
10743error:
Victor Stinner488fa492011-12-12 00:01:39 +010010744 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010745}
10746
10747void
10748PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10749{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010750 PyUnicode_Append(pleft, right);
10751 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010752}
10753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010754PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010755 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010757Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010758string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010759interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760
10761static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010762unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010764 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010765 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010766 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 int kind1, kind2, kind;
10769 void *buf1, *buf2;
10770 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771
Jesus Ceaac451502011-04-20 17:09:23 +020010772 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10773 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010774 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 kind1 = PyUnicode_KIND(self);
10777 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010778 if (kind2 > kind1)
10779 return PyLong_FromLong(0);
10780 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 buf1 = PyUnicode_DATA(self);
10782 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010784 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 if (!buf2) {
10786 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 return NULL;
10788 }
10789 len1 = PyUnicode_GET_LENGTH(self);
10790 len2 = PyUnicode_GET_LENGTH(substring);
10791
10792 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010793 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 case PyUnicode_1BYTE_KIND:
10795 iresult = ucs1lib_count(
10796 ((Py_UCS1*)buf1) + start, end - start,
10797 buf2, len2, PY_SSIZE_T_MAX
10798 );
10799 break;
10800 case PyUnicode_2BYTE_KIND:
10801 iresult = ucs2lib_count(
10802 ((Py_UCS2*)buf1) + start, end - start,
10803 buf2, len2, PY_SSIZE_T_MAX
10804 );
10805 break;
10806 case PyUnicode_4BYTE_KIND:
10807 iresult = ucs4lib_count(
10808 ((Py_UCS4*)buf1) + start, end - start,
10809 buf2, len2, PY_SSIZE_T_MAX
10810 );
10811 break;
10812 default:
10813 assert(0); iresult = 0;
10814 }
10815
10816 result = PyLong_FromSsize_t(iresult);
10817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 if (kind2 != kind)
10819 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820
10821 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010822
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823 return result;
10824}
10825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010826PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010827 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010829Encode S using the codec registered for encoding. Default encoding\n\
10830is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010831handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010832a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10833'xmlcharrefreplace' as well as any other name registered with\n\
10834codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
10836static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010837unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010839 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840 char *encoding = NULL;
10841 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010842
Benjamin Peterson308d6372009-09-18 21:42:35 +000010843 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10844 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010846 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010847}
10848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010849PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010850 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851\n\
10852Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010853If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854
10855static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010856unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010858 Py_ssize_t i, j, line_pos, src_len, incr;
10859 Py_UCS4 ch;
10860 PyObject *u;
10861 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010863 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010864 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865
10866 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
Antoine Pitrou22425222011-10-04 19:10:51 +020010869 if (PyUnicode_READY(self) == -1)
10870 return NULL;
10871
Thomas Wouters7e474022000-07-16 12:04:32 +000010872 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010873 src_len = PyUnicode_GET_LENGTH(self);
10874 i = j = line_pos = 0;
10875 kind = PyUnicode_KIND(self);
10876 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010877 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010878 for (; i < src_len; i++) {
10879 ch = PyUnicode_READ(kind, src_data, i);
10880 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010881 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010882 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010883 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010884 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010885 goto overflow;
10886 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010888 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010891 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010892 goto overflow;
10893 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 if (ch == '\n' || ch == '\r')
10896 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010898 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010899 if (!found)
10900 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010901
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010903 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904 if (!u)
10905 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010906 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907
Antoine Pitroue71d5742011-10-04 15:55:09 +020010908 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909
Antoine Pitroue71d5742011-10-04 15:55:09 +020010910 for (; i < src_len; i++) {
10911 ch = PyUnicode_READ(kind, src_data, i);
10912 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010914 incr = tabsize - (line_pos % tabsize);
10915 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010916 FILL(kind, dest_data, ' ', j, incr);
10917 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010918 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010919 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010920 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010921 line_pos++;
10922 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010923 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010924 if (ch == '\n' || ch == '\r')
10925 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010927 }
10928 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010929 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010930
Antoine Pitroue71d5742011-10-04 15:55:09 +020010931 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010932 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934}
10935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010936PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010937 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938\n\
10939Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010940such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941arguments start and end are interpreted as in slice notation.\n\
10942\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010943Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
10945static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010948 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010949 Py_ssize_t start;
10950 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010951 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952
Jesus Ceaac451502011-04-20 17:09:23 +020010953 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10954 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 if (PyUnicode_READY(self) == -1)
10958 return NULL;
10959 if (PyUnicode_READY(substring) == -1)
10960 return NULL;
10961
Victor Stinner7931d9a2011-11-04 00:22:48 +010010962 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963
10964 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (result == -2)
10967 return NULL;
10968
Christian Heimes217cfd12007-12-02 14:31:20 +000010969 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970}
10971
10972static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010973unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010975 void *data;
10976 enum PyUnicode_Kind kind;
10977 Py_UCS4 ch;
10978 PyObject *res;
10979
10980 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10981 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010983 }
10984 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10985 PyErr_SetString(PyExc_IndexError, "string index out of range");
10986 return NULL;
10987 }
10988 kind = PyUnicode_KIND(self);
10989 data = PyUnicode_DATA(self);
10990 ch = PyUnicode_READ(kind, data, index);
10991 if (ch < 256)
10992 return get_latin1_char(ch);
10993
10994 res = PyUnicode_New(1, ch);
10995 if (res == NULL)
10996 return NULL;
10997 kind = PyUnicode_KIND(res);
10998 data = PyUnicode_DATA(res);
10999 PyUnicode_WRITE(kind, data, 0, ch);
11000 assert(_PyUnicode_CheckConsistency(res, 1));
11001 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002}
11003
Guido van Rossumc2504932007-09-18 19:42:40 +000011004/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011005 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011006static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011007unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008{
Guido van Rossumc2504932007-09-18 19:42:40 +000011009 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011010 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011011
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011012#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011013 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011014#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 if (_PyUnicode_HASH(self) != -1)
11016 return _PyUnicode_HASH(self);
11017 if (PyUnicode_READY(self) == -1)
11018 return -1;
11019 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011020 /*
11021 We make the hash of the empty string be 0, rather than using
11022 (prefix ^ suffix), since this slightly obfuscates the hash secret
11023 */
11024 if (len == 0) {
11025 _PyUnicode_HASH(self) = 0;
11026 return 0;
11027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028
11029 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011030#define HASH(P) \
11031 x ^= (Py_uhash_t) *P << 7; \
11032 while (--len >= 0) \
11033 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034
Georg Brandl2fb477c2012-02-21 00:33:36 +010011035 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 switch (PyUnicode_KIND(self)) {
11037 case PyUnicode_1BYTE_KIND: {
11038 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11039 HASH(c);
11040 break;
11041 }
11042 case PyUnicode_2BYTE_KIND: {
11043 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11044 HASH(s);
11045 break;
11046 }
11047 default: {
11048 Py_UCS4 *l;
11049 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11050 "Impossible switch case in unicode_hash");
11051 l = PyUnicode_4BYTE_DATA(self);
11052 HASH(l);
11053 break;
11054 }
11055 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011056 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11057 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011058
Guido van Rossumc2504932007-09-18 19:42:40 +000011059 if (x == -1)
11060 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011062 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011066PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011067 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011069Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070
11071static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011074 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011075 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011076 Py_ssize_t start;
11077 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078
Jesus Ceaac451502011-04-20 17:09:23 +020011079 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11080 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 if (PyUnicode_READY(self) == -1)
11084 return NULL;
11085 if (PyUnicode_READY(substring) == -1)
11086 return NULL;
11087
Victor Stinner7931d9a2011-11-04 00:22:48 +010011088 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089
11090 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 if (result == -2)
11093 return NULL;
11094
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095 if (result < 0) {
11096 PyErr_SetString(PyExc_ValueError, "substring not found");
11097 return NULL;
11098 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011099
Christian Heimes217cfd12007-12-02 14:31:20 +000011100 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101}
11102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011103PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011104 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011106Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011107at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108
11109static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011110unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 Py_ssize_t i, length;
11113 int kind;
11114 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 int cased;
11116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117 if (PyUnicode_READY(self) == -1)
11118 return NULL;
11119 length = PyUnicode_GET_LENGTH(self);
11120 kind = PyUnicode_KIND(self);
11121 data = PyUnicode_DATA(self);
11122
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 if (length == 1)
11125 return PyBool_FromLong(
11126 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011128 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011130 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011131
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 for (i = 0; i < length; i++) {
11134 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011135
Benjamin Peterson29060642009-01-31 22:14:21 +000011136 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11137 return PyBool_FromLong(0);
11138 else if (!cased && Py_UNICODE_ISLOWER(ch))
11139 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011141 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142}
11143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011144PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011147Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011148at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149
11150static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011151unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 Py_ssize_t i, length;
11154 int kind;
11155 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156 int cased;
11157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 if (PyUnicode_READY(self) == -1)
11159 return NULL;
11160 length = PyUnicode_GET_LENGTH(self);
11161 kind = PyUnicode_KIND(self);
11162 data = PyUnicode_DATA(self);
11163
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 if (length == 1)
11166 return PyBool_FromLong(
11167 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011169 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011171 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011172
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 for (i = 0; i < length; i++) {
11175 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011176
Benjamin Peterson29060642009-01-31 22:14:21 +000011177 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11178 return PyBool_FromLong(0);
11179 else if (!cased && Py_UNICODE_ISUPPER(ch))
11180 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011182 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183}
11184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011185PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011188Return True if S is a titlecased string and there is at least one\n\
11189character in S, i.e. upper- and titlecase characters may only\n\
11190follow uncased characters and lowercase characters only cased ones.\n\
11191Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
11193static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011194unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 Py_ssize_t i, length;
11197 int kind;
11198 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 int cased, previous_is_cased;
11200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 if (PyUnicode_READY(self) == -1)
11202 return NULL;
11203 length = PyUnicode_GET_LENGTH(self);
11204 kind = PyUnicode_KIND(self);
11205 data = PyUnicode_DATA(self);
11206
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (length == 1) {
11209 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11210 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11211 (Py_UNICODE_ISUPPER(ch) != 0));
11212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011214 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011216 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011217
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218 cased = 0;
11219 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 for (i = 0; i < length; i++) {
11221 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011222
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11224 if (previous_is_cased)
11225 return PyBool_FromLong(0);
11226 previous_is_cased = 1;
11227 cased = 1;
11228 }
11229 else if (Py_UNICODE_ISLOWER(ch)) {
11230 if (!previous_is_cased)
11231 return PyBool_FromLong(0);
11232 previous_is_cased = 1;
11233 cased = 1;
11234 }
11235 else
11236 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011238 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239}
11240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011241PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011242 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011244Return True if all characters in S are whitespace\n\
11245and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246
11247static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011248unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 Py_ssize_t i, length;
11251 int kind;
11252 void *data;
11253
11254 if (PyUnicode_READY(self) == -1)
11255 return NULL;
11256 length = PyUnicode_GET_LENGTH(self);
11257 kind = PyUnicode_KIND(self);
11258 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 if (length == 1)
11262 return PyBool_FromLong(
11263 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011265 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 for (i = 0; i < length; i++) {
11270 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011271 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011274 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275}
11276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011277PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011279\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011280Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011281and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011282
11283static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011284unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 Py_ssize_t i, length;
11287 int kind;
11288 void *data;
11289
11290 if (PyUnicode_READY(self) == -1)
11291 return NULL;
11292 length = PyUnicode_GET_LENGTH(self);
11293 kind = PyUnicode_KIND(self);
11294 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011295
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011296 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 if (length == 1)
11298 return PyBool_FromLong(
11299 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011300
11301 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 for (i = 0; i < length; i++) {
11306 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011308 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011309 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011310}
11311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011312PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011313 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011314\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011315Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011316and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011317
11318static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011319unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 int kind;
11322 void *data;
11323 Py_ssize_t len, i;
11324
11325 if (PyUnicode_READY(self) == -1)
11326 return NULL;
11327
11328 kind = PyUnicode_KIND(self);
11329 data = PyUnicode_DATA(self);
11330 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011331
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011332 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 if (len == 1) {
11334 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11335 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11336 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011337
11338 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011340 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 for (i = 0; i < len; i++) {
11343 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011344 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011345 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011346 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011347 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011348}
11349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011350PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011351 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011353Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011354False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355
11356static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011357unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 Py_ssize_t i, length;
11360 int kind;
11361 void *data;
11362
11363 if (PyUnicode_READY(self) == -1)
11364 return NULL;
11365 length = PyUnicode_GET_LENGTH(self);
11366 kind = PyUnicode_KIND(self);
11367 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 if (length == 1)
11371 return PyBool_FromLong(
11372 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011374 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 for (i = 0; i < length; i++) {
11379 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011382 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383}
11384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011385PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011388Return True if all characters in S are digits\n\
11389and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390
11391static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011392unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 Py_ssize_t i, length;
11395 int kind;
11396 void *data;
11397
11398 if (PyUnicode_READY(self) == -1)
11399 return NULL;
11400 length = PyUnicode_GET_LENGTH(self);
11401 kind = PyUnicode_KIND(self);
11402 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 if (length == 1) {
11406 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11407 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011410 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011412 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 for (i = 0; i < length; i++) {
11415 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011416 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011418 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419}
11420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011421PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011422 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011424Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011425False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
11427static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011428unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 Py_ssize_t i, length;
11431 int kind;
11432 void *data;
11433
11434 if (PyUnicode_READY(self) == -1)
11435 return NULL;
11436 length = PyUnicode_GET_LENGTH(self);
11437 kind = PyUnicode_KIND(self);
11438 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 if (length == 1)
11442 return PyBool_FromLong(
11443 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011445 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011447 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 for (i = 0; i < length; i++) {
11450 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011453 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454}
11455
Martin v. Löwis47383402007-08-15 07:32:56 +000011456int
11457PyUnicode_IsIdentifier(PyObject *self)
11458{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 int kind;
11460 void *data;
11461 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011462 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 if (PyUnicode_READY(self) == -1) {
11465 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011466 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 }
11468
11469 /* Special case for empty strings */
11470 if (PyUnicode_GET_LENGTH(self) == 0)
11471 return 0;
11472 kind = PyUnicode_KIND(self);
11473 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011474
11475 /* PEP 3131 says that the first character must be in
11476 XID_Start and subsequent characters in XID_Continue,
11477 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011478 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011479 letters, digits, underscore). However, given the current
11480 definition of XID_Start and XID_Continue, it is sufficient
11481 to check just for these, except that _ must be allowed
11482 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011484 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011485 return 0;
11486
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011487 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011490 return 1;
11491}
11492
11493PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011495\n\
11496Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011497to the language definition.\n\
11498\n\
11499Use keyword.iskeyword() to test for reserved identifiers\n\
11500such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011501
11502static PyObject*
11503unicode_isidentifier(PyObject *self)
11504{
11505 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11506}
11507
Georg Brandl559e5d72008-06-11 18:37:52 +000011508PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011510\n\
11511Return True if all characters in S are considered\n\
11512printable in repr() or S is empty, False otherwise.");
11513
11514static PyObject*
11515unicode_isprintable(PyObject *self)
11516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 Py_ssize_t i, length;
11518 int kind;
11519 void *data;
11520
11521 if (PyUnicode_READY(self) == -1)
11522 return NULL;
11523 length = PyUnicode_GET_LENGTH(self);
11524 kind = PyUnicode_KIND(self);
11525 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011526
11527 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 if (length == 1)
11529 return PyBool_FromLong(
11530 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 for (i = 0; i < length; i++) {
11533 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011534 Py_RETURN_FALSE;
11535 }
11536 }
11537 Py_RETURN_TRUE;
11538}
11539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011540PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011541 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542\n\
11543Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011544iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
11546static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011547unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011549 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550}
11551
Martin v. Löwis18e16552006-02-15 17:27:45 +000011552static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011553unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 if (PyUnicode_READY(self) == -1)
11556 return -1;
11557 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558}
11559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011560PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011563Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011564done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565
11566static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011567unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011569 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 Py_UCS4 fillchar = ' ';
11571
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011572 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 return NULL;
11574
Benjamin Petersonbac79492012-01-14 13:34:47 -050011575 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Victor Stinnerc4b49542011-12-11 22:44:26 +010011578 if (PyUnicode_GET_LENGTH(self) >= width)
11579 return unicode_result_unchanged(self);
11580
11581 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582}
11583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011584PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011587Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588
11589static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011590unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011592 if (PyUnicode_READY(self) == -1)
11593 return NULL;
11594 if (PyUnicode_IS_ASCII(self))
11595 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011596 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597}
11598
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  Each entry is a format string of the form
   "|O:<method name>"; both the strings and the table are read-only. */
static const char * const stripformat[] = {
    "|O:lstrip", "|O:rstrip", "|O:strip"
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11607
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011608/* externally visible for str.strip(unicode) */
11609PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011610_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 void *data;
11613 int kind;
11614 Py_ssize_t i, j, len;
11615 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011616 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11619 return NULL;
11620
11621 kind = PyUnicode_KIND(self);
11622 data = PyUnicode_DATA(self);
11623 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011624 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11626 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011627 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011628
Benjamin Peterson14339b62009-01-31 16:36:08 +000011629 i = 0;
11630 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011631 while (i < len) {
11632 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11633 if (!BLOOM(sepmask, ch))
11634 break;
11635 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11636 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 i++;
11638 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011639 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011640
Benjamin Peterson14339b62009-01-31 16:36:08 +000011641 j = len;
11642 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011643 j--;
11644 while (j >= i) {
11645 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11646 if (!BLOOM(sepmask, ch))
11647 break;
11648 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11649 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011651 }
11652
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011654 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011655
Victor Stinner7931d9a2011-11-04 00:22:48 +010011656 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657}
11658
11659PyObject*
11660PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11661{
11662 unsigned char *data;
11663 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011664 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665
Victor Stinnerde636f32011-10-01 03:55:54 +020011666 if (PyUnicode_READY(self) == -1)
11667 return NULL;
11668
Victor Stinner684d5fd2012-05-03 02:32:34 +020011669 length = PyUnicode_GET_LENGTH(self);
11670 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011671
Victor Stinner684d5fd2012-05-03 02:32:34 +020011672 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011673 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674
Victor Stinnerde636f32011-10-01 03:55:54 +020011675 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011676 PyErr_SetString(PyExc_IndexError, "string index out of range");
11677 return NULL;
11678 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011679 if (start >= length || end < start)
11680 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011681
Victor Stinner684d5fd2012-05-03 02:32:34 +020011682 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011683 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011684 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011685 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011686 }
11687 else {
11688 kind = PyUnicode_KIND(self);
11689 data = PyUnicode_1BYTE_DATA(self);
11690 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011691 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011692 length);
11693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
11696static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011697do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 Py_ssize_t len, i, j;
11700
11701 if (PyUnicode_READY(self) == -1)
11702 return NULL;
11703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011705
Victor Stinnercc7af722013-04-09 22:39:24 +020011706 if (PyUnicode_IS_ASCII(self)) {
11707 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11708
11709 i = 0;
11710 if (striptype != RIGHTSTRIP) {
11711 while (i < len) {
11712 Py_UCS4 ch = data[i];
11713 if (!_Py_ascii_whitespace[ch])
11714 break;
11715 i++;
11716 }
11717 }
11718
11719 j = len;
11720 if (striptype != LEFTSTRIP) {
11721 j--;
11722 while (j >= i) {
11723 Py_UCS4 ch = data[j];
11724 if (!_Py_ascii_whitespace[ch])
11725 break;
11726 j--;
11727 }
11728 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011729 }
11730 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011731 else {
11732 int kind = PyUnicode_KIND(self);
11733 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734
Victor Stinnercc7af722013-04-09 22:39:24 +020011735 i = 0;
11736 if (striptype != RIGHTSTRIP) {
11737 while (i < len) {
11738 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11739 if (!Py_UNICODE_ISSPACE(ch))
11740 break;
11741 i++;
11742 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011743 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011744
11745 j = len;
11746 if (striptype != LEFTSTRIP) {
11747 j--;
11748 while (j >= i) {
11749 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11750 if (!Py_UNICODE_ISSPACE(ch))
11751 break;
11752 j--;
11753 }
11754 j++;
11755 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011756 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757
Victor Stinner7931d9a2011-11-04 00:22:48 +010011758 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011761
11762static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011763do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011764{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011765 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011766
Benjamin Peterson14339b62009-01-31 16:36:08 +000011767 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11768 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011769
Benjamin Peterson14339b62009-01-31 16:36:08 +000011770 if (sep != NULL && sep != Py_None) {
11771 if (PyUnicode_Check(sep))
11772 return _PyUnicode_XStrip(self, striptype, sep);
11773 else {
11774 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 "%s arg must be None or str",
11776 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011777 return NULL;
11778 }
11779 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011780
Benjamin Peterson14339b62009-01-31 16:36:08 +000011781 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782}
11783
11784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011785PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787\n\
11788Return a copy of the string S with leading and trailing\n\
11789whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011790If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791
11792static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011793unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011794{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 if (PyTuple_GET_SIZE(args) == 0)
11796 return do_strip(self, BOTHSTRIP); /* Common case */
11797 else
11798 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011799}
11800
11801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011802PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011803 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011804\n\
11805Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011806If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807
11808static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011809unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011811 if (PyTuple_GET_SIZE(args) == 0)
11812 return do_strip(self, LEFTSTRIP); /* Common case */
11813 else
11814 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815}
11816
11817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011818PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011820\n\
11821Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011822If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011823
11824static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011825unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011826{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011827 if (PyTuple_GET_SIZE(args) == 0)
11828 return do_strip(self, RIGHTSTRIP); /* Common case */
11829 else
11830 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011831}
11832
11833
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011835unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011837 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
Serhiy Storchaka05997252013-01-26 12:14:02 +020011840 if (len < 1)
11841 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842
Victor Stinnerc4b49542011-12-11 22:44:26 +010011843 /* no repeat, return original string */
11844 if (len == 1)
11845 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011846
Benjamin Petersonbac79492012-01-14 13:34:47 -050011847 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 return NULL;
11849
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011850 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011851 PyErr_SetString(PyExc_OverflowError,
11852 "repeated string is too long");
11853 return NULL;
11854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011856
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011857 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858 if (!u)
11859 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011860 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 if (PyUnicode_GET_LENGTH(str) == 1) {
11863 const int kind = PyUnicode_KIND(str);
11864 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011865 if (kind == PyUnicode_1BYTE_KIND) {
11866 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011867 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011868 }
11869 else if (kind == PyUnicode_2BYTE_KIND) {
11870 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011871 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011872 ucs2[n] = fill_char;
11873 } else {
11874 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11875 assert(kind == PyUnicode_4BYTE_KIND);
11876 for (n = 0; n < len; ++n)
11877 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011878 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 }
11880 else {
11881 /* number of characters copied this far */
11882 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011883 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 char *to = (char *) PyUnicode_DATA(u);
11885 Py_MEMCPY(to, PyUnicode_DATA(str),
11886 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011887 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 n = (done <= nchars-done) ? done : nchars-done;
11889 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011890 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 }
11893
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011894 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011895 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896}
11897
Alexander Belopolsky40018472011-02-26 01:02:56 +000011898PyObject *
11899PyUnicode_Replace(PyObject *obj,
11900 PyObject *subobj,
11901 PyObject *replobj,
11902 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903{
11904 PyObject *self;
11905 PyObject *str1;
11906 PyObject *str2;
11907 PyObject *result;
11908
11909 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011910 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011913 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 Py_DECREF(self);
11915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 }
11917 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011918 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 Py_DECREF(self);
11920 Py_DECREF(str1);
11921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011923 if (PyUnicode_READY(self) == -1 ||
11924 PyUnicode_READY(str1) == -1 ||
11925 PyUnicode_READY(str2) == -1)
11926 result = NULL;
11927 else
11928 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929 Py_DECREF(self);
11930 Py_DECREF(str1);
11931 Py_DECREF(str2);
11932 return result;
11933}
11934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011935PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011936 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937\n\
11938Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011939old replaced by new. If the optional argument count is\n\
11940given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941
11942static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 PyObject *str1;
11946 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011947 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 PyObject *result;
11949
Martin v. Löwis18e16552006-02-15 17:27:45 +000011950 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011952 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011955 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 return NULL;
11957 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011958 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 Py_DECREF(str1);
11960 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011961 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011962 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11963 result = NULL;
11964 else
11965 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966
11967 Py_DECREF(str1);
11968 Py_DECREF(str2);
11969 return result;
11970}
11971
Alexander Belopolsky40018472011-02-26 01:02:56 +000011972static PyObject *
11973unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011975 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 Py_ssize_t isize;
11977 Py_ssize_t osize, squote, dquote, i, o;
11978 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020011979 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011983 return NULL;
11984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 isize = PyUnicode_GET_LENGTH(unicode);
11986 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 /* Compute length of output, quote characters, and
11989 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020011990 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 max = 127;
11992 squote = dquote = 0;
11993 ikind = PyUnicode_KIND(unicode);
11994 for (i = 0; i < isize; i++) {
11995 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11996 switch (ch) {
11997 case '\'': squote++; osize++; break;
11998 case '"': dquote++; osize++; break;
11999 case '\\': case '\t': case '\r': case '\n':
12000 osize += 2; break;
12001 default:
12002 /* Fast-path ASCII */
12003 if (ch < ' ' || ch == 0x7f)
12004 osize += 4; /* \xHH */
12005 else if (ch < 0x7f)
12006 osize++;
12007 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12008 osize++;
12009 max = ch > max ? ch : max;
12010 }
12011 else if (ch < 0x100)
12012 osize += 4; /* \xHH */
12013 else if (ch < 0x10000)
12014 osize += 6; /* \uHHHH */
12015 else
12016 osize += 10; /* \uHHHHHHHH */
12017 }
12018 }
12019
12020 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012021 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012023 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 if (dquote)
12025 /* Both squote and dquote present. Use squote,
12026 and escape them */
12027 osize += squote;
12028 else
12029 quote = '"';
12030 }
Victor Stinner55c08782013-04-14 18:45:39 +020012031 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032
12033 repr = PyUnicode_New(osize, max);
12034 if (repr == NULL)
12035 return NULL;
12036 okind = PyUnicode_KIND(repr);
12037 odata = PyUnicode_DATA(repr);
12038
12039 PyUnicode_WRITE(okind, odata, 0, quote);
12040 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012041 if (unchanged) {
12042 _PyUnicode_FastCopyCharacters(repr, 1,
12043 unicode, 0,
12044 isize);
12045 }
12046 else {
12047 for (i = 0, o = 1; i < isize; i++) {
12048 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049
Victor Stinner55c08782013-04-14 18:45:39 +020012050 /* Escape quotes and backslashes */
12051 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012052 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012054 continue;
12055 }
12056
12057 /* Map special whitespace to '\t', \n', '\r' */
12058 if (ch == '\t') {
12059 PyUnicode_WRITE(okind, odata, o++, '\\');
12060 PyUnicode_WRITE(okind, odata, o++, 't');
12061 }
12062 else if (ch == '\n') {
12063 PyUnicode_WRITE(okind, odata, o++, '\\');
12064 PyUnicode_WRITE(okind, odata, o++, 'n');
12065 }
12066 else if (ch == '\r') {
12067 PyUnicode_WRITE(okind, odata, o++, '\\');
12068 PyUnicode_WRITE(okind, odata, o++, 'r');
12069 }
12070
12071 /* Map non-printable US ASCII to '\xhh' */
12072 else if (ch < ' ' || ch == 0x7F) {
12073 PyUnicode_WRITE(okind, odata, o++, '\\');
12074 PyUnicode_WRITE(okind, odata, o++, 'x');
12075 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12076 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12077 }
12078
12079 /* Copy ASCII characters as-is */
12080 else if (ch < 0x7F) {
12081 PyUnicode_WRITE(okind, odata, o++, ch);
12082 }
12083
12084 /* Non-ASCII characters */
12085 else {
12086 /* Map Unicode whitespace and control characters
12087 (categories Z* and C* except ASCII space)
12088 */
12089 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12090 PyUnicode_WRITE(okind, odata, o++, '\\');
12091 /* Map 8-bit characters to '\xhh' */
12092 if (ch <= 0xff) {
12093 PyUnicode_WRITE(okind, odata, o++, 'x');
12094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12095 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12096 }
12097 /* Map 16-bit characters to '\uxxxx' */
12098 else if (ch <= 0xffff) {
12099 PyUnicode_WRITE(okind, odata, o++, 'u');
12100 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12101 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12104 }
12105 /* Map 21-bit characters to '\U00xxxxxx' */
12106 else {
12107 PyUnicode_WRITE(okind, odata, o++, 'U');
12108 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12109 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12110 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12111 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12112 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12115 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12116 }
12117 }
12118 /* Copy characters as-is */
12119 else {
12120 PyUnicode_WRITE(okind, odata, o++, ch);
12121 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012122 }
12123 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012126 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012127 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128}
12129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012130PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132\n\
12133Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012134such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135arguments start and end are interpreted as in slice notation.\n\
12136\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012137Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138
12139static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012142 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012143 Py_ssize_t start;
12144 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012145 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
Jesus Ceaac451502011-04-20 17:09:23 +020012147 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12148 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 if (PyUnicode_READY(self) == -1)
12152 return NULL;
12153 if (PyUnicode_READY(substring) == -1)
12154 return NULL;
12155
Victor Stinner7931d9a2011-11-04 00:22:48 +010012156 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157
12158 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 if (result == -2)
12161 return NULL;
12162
Christian Heimes217cfd12007-12-02 14:31:20 +000012163 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164}
12165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012166PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012167 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012169Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
12171static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012174 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012175 Py_ssize_t start;
12176 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012177 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
Jesus Ceaac451502011-04-20 17:09:23 +020012179 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12180 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (PyUnicode_READY(self) == -1)
12184 return NULL;
12185 if (PyUnicode_READY(substring) == -1)
12186 return NULL;
12187
Victor Stinner7931d9a2011-11-04 00:22:48 +010012188 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
12190 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 if (result == -2)
12193 return NULL;
12194
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195 if (result < 0) {
12196 PyErr_SetString(PyExc_ValueError, "substring not found");
12197 return NULL;
12198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199
Christian Heimes217cfd12007-12-02 14:31:20 +000012200 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201}
12202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012203PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012204 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012206Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012207done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
12209static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012210unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012212 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 Py_UCS4 fillchar = ' ';
12214
Victor Stinnere9a29352011-10-01 02:14:59 +020012215 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012217
Benjamin Petersonbac79492012-01-14 13:34:47 -050012218 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219 return NULL;
12220
Victor Stinnerc4b49542011-12-11 22:44:26 +010012221 if (PyUnicode_GET_LENGTH(self) >= width)
12222 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223
Victor Stinnerc4b49542011-12-11 22:44:26 +010012224 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225}
12226
Alexander Belopolsky40018472011-02-26 01:02:56 +000012227PyObject *
12228PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229{
12230 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012231
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 s = PyUnicode_FromObject(s);
12233 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012234 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012235 if (sep != NULL) {
12236 sep = PyUnicode_FromObject(sep);
12237 if (sep == NULL) {
12238 Py_DECREF(s);
12239 return NULL;
12240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241 }
12242
Victor Stinner9310abb2011-10-05 00:59:23 +020012243 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244
12245 Py_DECREF(s);
12246 Py_XDECREF(sep);
12247 return result;
12248}
12249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012250PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012251 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252\n\
12253Return a list of the words in S, using sep as the\n\
12254delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012255splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012256whitespace string is a separator and empty strings are\n\
12257removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258
12259static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012260unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012262 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012264 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012266 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12267 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 return NULL;
12269
12270 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012273 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012275 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276}
12277
Thomas Wouters477c8d52006-05-27 19:21:47 +000012278PyObject *
12279PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12280{
12281 PyObject* str_obj;
12282 PyObject* sep_obj;
12283 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 int kind1, kind2, kind;
12285 void *buf1 = NULL, *buf2 = NULL;
12286 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012287
12288 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012289 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012290 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012292 if (!sep_obj) {
12293 Py_DECREF(str_obj);
12294 return NULL;
12295 }
12296 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12297 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012298 Py_DECREF(str_obj);
12299 return NULL;
12300 }
12301
Victor Stinner14f8f022011-10-05 20:58:25 +020012302 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012304 kind = Py_MAX(kind1, kind2);
12305 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012307 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 if (!buf1)
12309 goto onError;
12310 buf2 = PyUnicode_DATA(sep_obj);
12311 if (kind2 != kind)
12312 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12313 if (!buf2)
12314 goto onError;
12315 len1 = PyUnicode_GET_LENGTH(str_obj);
12316 len2 = PyUnicode_GET_LENGTH(sep_obj);
12317
Benjamin Petersonead6b532011-12-20 17:23:42 -060012318 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012320 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12321 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12322 else
12323 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 break;
12325 case PyUnicode_2BYTE_KIND:
12326 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12327 break;
12328 case PyUnicode_4BYTE_KIND:
12329 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12330 break;
12331 default:
12332 assert(0);
12333 out = 0;
12334 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012335
12336 Py_DECREF(sep_obj);
12337 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 if (kind1 != kind)
12339 PyMem_Free(buf1);
12340 if (kind2 != kind)
12341 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012342
12343 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 onError:
12345 Py_DECREF(sep_obj);
12346 Py_DECREF(str_obj);
12347 if (kind1 != kind && buf1)
12348 PyMem_Free(buf1);
12349 if (kind2 != kind && buf2)
12350 PyMem_Free(buf2);
12351 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012352}
12353
12354
12355PyObject *
12356PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12357{
12358 PyObject* str_obj;
12359 PyObject* sep_obj;
12360 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 int kind1, kind2, kind;
12362 void *buf1 = NULL, *buf2 = NULL;
12363 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012364
12365 str_obj = PyUnicode_FromObject(str_in);
12366 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012367 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012368 sep_obj = PyUnicode_FromObject(sep_in);
12369 if (!sep_obj) {
12370 Py_DECREF(str_obj);
12371 return NULL;
12372 }
12373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 kind1 = PyUnicode_KIND(str_in);
12375 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012376 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 buf1 = PyUnicode_DATA(str_in);
12378 if (kind1 != kind)
12379 buf1 = _PyUnicode_AsKind(str_in, kind);
12380 if (!buf1)
12381 goto onError;
12382 buf2 = PyUnicode_DATA(sep_obj);
12383 if (kind2 != kind)
12384 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12385 if (!buf2)
12386 goto onError;
12387 len1 = PyUnicode_GET_LENGTH(str_obj);
12388 len2 = PyUnicode_GET_LENGTH(sep_obj);
12389
Benjamin Petersonead6b532011-12-20 17:23:42 -060012390 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012392 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12393 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12394 else
12395 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 break;
12397 case PyUnicode_2BYTE_KIND:
12398 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12399 break;
12400 case PyUnicode_4BYTE_KIND:
12401 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12402 break;
12403 default:
12404 assert(0);
12405 out = 0;
12406 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012407
12408 Py_DECREF(sep_obj);
12409 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 if (kind1 != kind)
12411 PyMem_Free(buf1);
12412 if (kind2 != kind)
12413 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012414
12415 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 onError:
12417 Py_DECREF(sep_obj);
12418 Py_DECREF(str_obj);
12419 if (kind1 != kind && buf1)
12420 PyMem_Free(buf1);
12421 if (kind2 != kind && buf2)
12422 PyMem_Free(buf2);
12423 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012424}
12425
12426PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012427 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012428\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012429Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012430the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012431found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012432
12433static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012434unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012435{
Victor Stinner9310abb2011-10-05 00:59:23 +020012436 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012437}
12438
12439PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012440 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012441\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012442Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012444separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445
12446static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012447unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012448{
Victor Stinner9310abb2011-10-05 00:59:23 +020012449 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012450}
12451
Alexander Belopolsky40018472011-02-26 01:02:56 +000012452PyObject *
12453PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012454{
12455 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012456
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012457 s = PyUnicode_FromObject(s);
12458 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 if (sep != NULL) {
12461 sep = PyUnicode_FromObject(sep);
12462 if (sep == NULL) {
12463 Py_DECREF(s);
12464 return NULL;
12465 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012466 }
12467
Victor Stinner9310abb2011-10-05 00:59:23 +020012468 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012469
12470 Py_DECREF(s);
12471 Py_XDECREF(sep);
12472 return result;
12473}
12474
12475PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012476 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012477\n\
12478Return a list of the words in S, using sep as the\n\
12479delimiter string, starting at the end of the string and\n\
12480working to the front. If maxsplit is given, at most maxsplit\n\
12481splits are done. If sep is not specified, any whitespace string\n\
12482is a separator.");
12483
12484static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012485unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012486{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012487 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012488 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012489 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012490
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012491 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12492 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012493 return NULL;
12494
12495 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012496 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012497 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012498 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012499 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012500 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012501}
12502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012503PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505\n\
12506Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012507Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012508is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
12510static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012511unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012513 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012514 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012516 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12517 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518 return NULL;
12519
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012520 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521}
12522
12523static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012524PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012526 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527}
12528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012529PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531\n\
12532Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012533and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534
12535static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012536unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012538 if (PyUnicode_READY(self) == -1)
12539 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012540 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541}
12542
Georg Brandlceee0772007-11-27 23:48:05 +000012543PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012544 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012545\n\
12546Return a translation table usable for str.translate().\n\
12547If there is only one argument, it must be a dictionary mapping Unicode\n\
12548ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012549Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012550If there are two arguments, they must be strings of equal length, and\n\
12551in the resulting dictionary, each character in x will be mapped to the\n\
12552character at the same position in y. If there is a third argument, it\n\
12553must be a string, whose characters will be mapped to None in the result.");
12554
12555static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012556unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012557{
12558 PyObject *x, *y = NULL, *z = NULL;
12559 PyObject *new = NULL, *key, *value;
12560 Py_ssize_t i = 0;
12561 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012562
Georg Brandlceee0772007-11-27 23:48:05 +000012563 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12564 return NULL;
12565 new = PyDict_New();
12566 if (!new)
12567 return NULL;
12568 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 int x_kind, y_kind, z_kind;
12570 void *x_data, *y_data, *z_data;
12571
Georg Brandlceee0772007-11-27 23:48:05 +000012572 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012573 if (!PyUnicode_Check(x)) {
12574 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12575 "be a string if there is a second argument");
12576 goto err;
12577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012579 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12580 "arguments must have equal length");
12581 goto err;
12582 }
12583 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 x_kind = PyUnicode_KIND(x);
12585 y_kind = PyUnicode_KIND(y);
12586 x_data = PyUnicode_DATA(x);
12587 y_data = PyUnicode_DATA(y);
12588 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12589 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012590 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012591 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012592 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012593 if (!value) {
12594 Py_DECREF(key);
12595 goto err;
12596 }
Georg Brandlceee0772007-11-27 23:48:05 +000012597 res = PyDict_SetItem(new, key, value);
12598 Py_DECREF(key);
12599 Py_DECREF(value);
12600 if (res < 0)
12601 goto err;
12602 }
12603 /* create entries for deleting chars in z */
12604 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 z_kind = PyUnicode_KIND(z);
12606 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012607 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012609 if (!key)
12610 goto err;
12611 res = PyDict_SetItem(new, key, Py_None);
12612 Py_DECREF(key);
12613 if (res < 0)
12614 goto err;
12615 }
12616 }
12617 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 int kind;
12619 void *data;
12620
Georg Brandlceee0772007-11-27 23:48:05 +000012621 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012622 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012623 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12624 "to maketrans it must be a dict");
12625 goto err;
12626 }
12627 /* copy entries into the new dict, converting string keys to int keys */
12628 while (PyDict_Next(x, &i, &key, &value)) {
12629 if (PyUnicode_Check(key)) {
12630 /* convert string keys to integer keys */
12631 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012632 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012633 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12634 "table must be of length 1");
12635 goto err;
12636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 kind = PyUnicode_KIND(key);
12638 data = PyUnicode_DATA(key);
12639 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012640 if (!newkey)
12641 goto err;
12642 res = PyDict_SetItem(new, newkey, value);
12643 Py_DECREF(newkey);
12644 if (res < 0)
12645 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012646 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012647 /* just keep integer keys */
12648 if (PyDict_SetItem(new, key, value) < 0)
12649 goto err;
12650 } else {
12651 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12652 "be strings or integers");
12653 goto err;
12654 }
12655 }
12656 }
12657 return new;
12658 err:
12659 Py_DECREF(new);
12660 return NULL;
12661}
12662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012663PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012664 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665\n\
12666Return a copy of the string S, where all characters have been mapped\n\
12667through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012668Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012669Unmapped characters are left untouched. Characters mapped to None\n\
12670are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671
12672static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676}
12677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012678PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012681Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682
12683static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012684unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012686 if (PyUnicode_READY(self) == -1)
12687 return NULL;
12688 if (PyUnicode_IS_ASCII(self))
12689 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012690 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691}
12692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012693PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012694 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012696Pad a numeric string S with zeros on the left, to fill a field\n\
12697of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698
12699static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012700unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012702 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012703 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012704 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 int kind;
12706 void *data;
12707 Py_UCS4 chr;
12708
Martin v. Löwis18e16552006-02-15 17:27:45 +000012709 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710 return NULL;
12711
Benjamin Petersonbac79492012-01-14 13:34:47 -050012712 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714
Victor Stinnerc4b49542011-12-11 22:44:26 +010012715 if (PyUnicode_GET_LENGTH(self) >= width)
12716 return unicode_result_unchanged(self);
12717
12718 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719
12720 u = pad(self, fill, 0, '0');
12721
Walter Dörwald068325e2002-04-15 13:36:47 +000012722 if (u == NULL)
12723 return NULL;
12724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 kind = PyUnicode_KIND(u);
12726 data = PyUnicode_DATA(u);
12727 chr = PyUnicode_READ(kind, data, fill);
12728
12729 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 PyUnicode_WRITE(kind, data, 0, chr);
12732 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733 }
12734
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012735 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012736 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738
#if 0
/* Compiled out.  Would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
12746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012747PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012750Return True if S starts with the specified prefix, False otherwise.\n\
12751With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012752With optional end, stop comparing S at that position.\n\
12753prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012754
12755static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012756unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012759 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012760 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012761 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012762 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012763 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764
Jesus Ceaac451502011-04-20 17:09:23 +020012765 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767 if (PyTuple_Check(subobj)) {
12768 Py_ssize_t i;
12769 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012770 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012771 if (substring == NULL)
12772 return NULL;
12773 result = tailmatch(self, substring, start, end, -1);
12774 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012775 if (result == -1)
12776 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012777 if (result) {
12778 Py_RETURN_TRUE;
12779 }
12780 }
12781 /* nothing matched */
12782 Py_RETURN_FALSE;
12783 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012784 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012785 if (substring == NULL) {
12786 if (PyErr_ExceptionMatches(PyExc_TypeError))
12787 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12788 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012790 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012791 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012793 if (result == -1)
12794 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012795 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796}
12797
12798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012799PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012802Return True if S ends with the specified suffix, False otherwise.\n\
12803With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012804With optional end, stop comparing S at that position.\n\
12805suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806
12807static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012808unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012811 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012812 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012813 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012814 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012815 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
Jesus Ceaac451502011-04-20 17:09:23 +020012817 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012819 if (PyTuple_Check(subobj)) {
12820 Py_ssize_t i;
12821 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012822 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012823 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012824 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012826 result = tailmatch(self, substring, start, end, +1);
12827 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012828 if (result == -1)
12829 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012830 if (result) {
12831 Py_RETURN_TRUE;
12832 }
12833 }
12834 Py_RETURN_FALSE;
12835 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012836 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012837 if (substring == NULL) {
12838 if (PyErr_ExceptionMatches(PyExc_TypeError))
12839 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12840 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012841 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012842 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012843 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010012844 if (result == -1)
12845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012847 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848}
12849
Victor Stinner202fdca2012-05-07 12:47:02 +020012850Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012851_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012852{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012853 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012854 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12855 writer->data = PyUnicode_DATA(writer->buffer);
12856 writer->kind = PyUnicode_KIND(writer->buffer);
12857}
12858
Victor Stinnerd3f08822012-05-29 12:57:52 +020012859void
12860_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012861{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012862 memset(writer, 0, sizeof(*writer));
12863#ifdef Py_DEBUG
12864 writer->kind = 5; /* invalid kind */
12865#endif
12866 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012867 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012868}
12869
Victor Stinnerd3f08822012-05-29 12:57:52 +020012870int
12871_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12872 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012873{
12874 Py_ssize_t newlen;
12875 PyObject *newbuffer;
12876
Victor Stinnerd3f08822012-05-29 12:57:52 +020012877 assert(length > 0);
12878
Victor Stinner202fdca2012-05-07 12:47:02 +020012879 if (length > PY_SSIZE_T_MAX - writer->pos) {
12880 PyErr_NoMemory();
12881 return -1;
12882 }
12883 newlen = writer->pos + length;
12884
Victor Stinnerd3f08822012-05-29 12:57:52 +020012885 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012886 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012887 /* overallocate 25% to limit the number of resize */
12888 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12889 newlen += newlen / 4;
12890 if (newlen < writer->min_length)
12891 newlen = writer->min_length;
12892 }
12893 writer->buffer = PyUnicode_New(newlen, maxchar);
12894 if (writer->buffer == NULL)
12895 return -1;
12896 _PyUnicodeWriter_Update(writer);
12897 return 0;
12898 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012899
Victor Stinnerd3f08822012-05-29 12:57:52 +020012900 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012901 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012902 /* overallocate 25% to limit the number of resize */
12903 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12904 newlen += newlen / 4;
12905 if (newlen < writer->min_length)
12906 newlen = writer->min_length;
12907 }
12908
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012909 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012910 /* resize + widen */
12911 newbuffer = PyUnicode_New(newlen, maxchar);
12912 if (newbuffer == NULL)
12913 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012914 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12915 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012916 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012917 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012918 }
12919 else {
12920 newbuffer = resize_compact(writer->buffer, newlen);
12921 if (newbuffer == NULL)
12922 return -1;
12923 }
12924 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012925 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012926 }
12927 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012928 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012929 newbuffer = PyUnicode_New(writer->size, maxchar);
12930 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012931 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012932 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12933 writer->buffer, 0, writer->pos);
12934 Py_DECREF(writer->buffer);
12935 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012936 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012937 }
12938 return 0;
12939}
12940
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020012941Py_LOCAL_INLINE(int)
12942_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020012943{
12944 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
12945 return -1;
12946 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
12947 writer->pos++;
12948 return 0;
12949}
12950
12951int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020012952_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
12953{
12954 return _PyUnicodeWriter_WriteCharInline(writer, ch);
12955}
12956
12957int
Victor Stinnerd3f08822012-05-29 12:57:52 +020012958_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12959{
12960 Py_UCS4 maxchar;
12961 Py_ssize_t len;
12962
12963 if (PyUnicode_READY(str) == -1)
12964 return -1;
12965 len = PyUnicode_GET_LENGTH(str);
12966 if (len == 0)
12967 return 0;
12968 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12969 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012970 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012971 Py_INCREF(str);
12972 writer->buffer = str;
12973 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012974 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012975 writer->size = 0;
12976 writer->pos += len;
12977 return 0;
12978 }
12979 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12980 return -1;
12981 }
12982 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12983 str, 0, len);
12984 writer->pos += len;
12985 return 0;
12986}
12987
Victor Stinnere215d962012-10-06 23:03:36 +020012988int
Victor Stinnercfc4c132013-04-03 01:48:39 +020012989_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
12990 Py_ssize_t start, Py_ssize_t end)
12991{
12992 Py_UCS4 maxchar;
12993 Py_ssize_t len;
12994
12995 if (PyUnicode_READY(str) == -1)
12996 return -1;
12997
12998 assert(0 <= start);
12999 assert(end <= PyUnicode_GET_LENGTH(str));
13000 assert(start <= end);
13001
13002 if (end == 0)
13003 return 0;
13004
13005 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13006 return _PyUnicodeWriter_WriteStr(writer, str);
13007
13008 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13009 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13010 else
13011 maxchar = writer->maxchar;
13012 len = end - start;
13013
13014 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13015 return -1;
13016
13017 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13018 str, start, len);
13019 writer->pos += len;
13020 return 0;
13021}
13022
13023int
Victor Stinnere215d962012-10-06 23:03:36 +020013024_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13025{
13026 Py_UCS4 maxchar;
13027
13028 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13029 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13030 return -1;
13031 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13032 writer->pos += len;
13033 return 0;
13034}
13035
Victor Stinnerd3f08822012-05-29 12:57:52 +020013036PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013037_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013038{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013039 if (writer->pos == 0) {
13040 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013041 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013042 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013043 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020013044 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
13045 return writer->buffer;
13046 }
13047 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13048 PyObject *newbuffer;
13049 newbuffer = resize_compact(writer->buffer, writer->pos);
13050 if (newbuffer == NULL) {
13051 Py_DECREF(writer->buffer);
13052 return NULL;
13053 }
13054 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013055 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020013056 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010013057 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013058}
13059
/* Release the writer's reference to its buffer (if any) and reset the
   pointer to NULL.  Called on error paths in place of
   _PyUnicodeWriter_Finish(), e.g. by unicode__format__(). */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
13065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013067
13068PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013070\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013071Return a formatted version of S, using substitutions from args and kwargs.\n\
13072The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013073
Eric Smith27bbca62010-11-04 17:06:58 +000013074PyDoc_STRVAR(format_map__doc__,
13075 "S.format_map(mapping) -> str\n\
13076\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013077Return a formatted version of S, using substitutions from mapping.\n\
13078The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013079
Eric Smith4a7d76d2008-05-30 18:10:19 +000013080static PyObject *
13081unicode__format__(PyObject* self, PyObject* args)
13082{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013083 PyObject *format_spec;
13084 _PyUnicodeWriter writer;
13085 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013086
13087 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13088 return NULL;
13089
Victor Stinnerd3f08822012-05-29 12:57:52 +020013090 if (PyUnicode_READY(self) == -1)
13091 return NULL;
13092 _PyUnicodeWriter_Init(&writer, 0);
13093 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13094 self, format_spec, 0,
13095 PyUnicode_GET_LENGTH(format_spec));
13096 if (ret == -1) {
13097 _PyUnicodeWriter_Dealloc(&writer);
13098 return NULL;
13099 }
13100 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013101}
13102
Eric Smith8c663262007-08-25 02:26:07 +000013103PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013105\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013106Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013107
13108static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013109unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 Py_ssize_t size;
13112
13113 /* If it's a compact object, account for base structure +
13114 character data. */
13115 if (PyUnicode_IS_COMPACT_ASCII(v))
13116 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13117 else if (PyUnicode_IS_COMPACT(v))
13118 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013119 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 else {
13121 /* If it is a two-block object, account for base object, and
13122 for character block if present. */
13123 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013124 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013126 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 }
13128 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013129 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013130 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013132 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013133 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134
13135 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013136}
13137
13138PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013140
13141static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013142unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013143{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013144 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 if (!copy)
13146 return NULL;
13147 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013148}
13149
/* Method table: one entry per Python-visible method, giving its name, the C
   function implementing it, the calling-convention flags and (for most
   entries) its docstring.  The table ends with a {NULL, NULL} sentinel.
   NOTE(review): presumably installed as tp_methods of the string type --
   the type object is not visible from here. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only static method in the table (METH_STATIC). */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13206
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013207static PyObject *
13208unicode_mod(PyObject *v, PyObject *w)
13209{
Brian Curtindfc80e32011-08-10 20:28:54 -050013210 if (!PyUnicode_Check(v))
13211 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013213}
13214
/* Number protocol slots for unicode objects.  Only the remainder slot is
   filled in; it routes the % operator to unicode_mod().  Slots after
   nb_remainder are left out of the initializer and are therefore zero. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13221
/* Sequence protocol slots for unicode objects: length, concatenation,
   repetition, item access and containment.  The slice and item-assignment
   slots are 0 (not provided). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13232
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013233static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013234unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 if (PyUnicode_READY(self) == -1)
13237 return NULL;
13238
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013239 if (PyIndex_Check(item)) {
13240 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013241 if (i == -1 && PyErr_Occurred())
13242 return NULL;
13243 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013244 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013245 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013246 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013247 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013248 PyObject *result;
13249 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013250 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013251 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013253 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013254 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013255 return NULL;
13256 }
13257
13258 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013259 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013261 slicelength == PyUnicode_GET_LENGTH(self)) {
13262 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013263 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013264 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013265 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013266 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013267 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013268 src_kind = PyUnicode_KIND(self);
13269 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013270 if (!PyUnicode_IS_ASCII(self)) {
13271 kind_limit = kind_maxchar_limit(src_kind);
13272 max_char = 0;
13273 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13274 ch = PyUnicode_READ(src_kind, src_data, cur);
13275 if (ch > max_char) {
13276 max_char = ch;
13277 if (max_char >= kind_limit)
13278 break;
13279 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013280 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013281 }
Victor Stinner55c99112011-10-13 01:17:06 +020013282 else
13283 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013284 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013285 if (result == NULL)
13286 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013287 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013288 dest_data = PyUnicode_DATA(result);
13289
13290 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013291 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13292 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013293 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013294 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013295 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013296 } else {
13297 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13298 return NULL;
13299 }
13300}
13301
/* Mapping protocol slots for unicode objects: length and subscripting
   (index or slice, see unicode_subscript()).  Item assignment is not
   provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13307
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309/* Helpers for PyUnicode_Format() */
13310
/* State shared by the PyUnicode_Format() helpers: where the arguments come
   from, the current parse position in the format string, and the writer
   that receives the output. */
struct unicode_formatter_t {
    /* Current argument source: a tuple indexed by argidx, or a single
       object when arglen is negative (see unicode_format_getnextarg()). */
    PyObject *args;
    /* Non-zero when args holds a reference obtained from a "%(name)"
       lookup; that reference is released before args is replaced. */
    int args_owned;
    /* arglen: number of arguments (set to -1 after a "%(name)" lookup);
       argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    /* Mapping used for "%(name)" lookups, or NULL when none was given. */
    PyObject *dict;

    /* Kind and data pointer used to read characters of the format string. */
    enum PyUnicode_Kind fmtkind;
    /* fmtcnt is decremented and fmtpos incremented for each format
       character consumed; fmtpos is the index of the next one to read. */
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    /* The format string object itself (used to extract "%(name)" keys). */
    PyObject *fmtstr;

    /* Output accumulator. */
    _PyUnicodeWriter writer;
};
13324
/* Parsed description of one "%" conversion in the format string. */
struct unicode_format_arg_t {
    /* Last character read from the format string; the formatters use it as
       the conversion type (e.g. 'd', 'x', 'f'). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT, F_ZERO. */
    int flags;
    /* Field width; mainformatlong() treats -1 as "not given". */
    Py_ssize_t width;
    /* Precision; mainformatlong() treats -1 as "not given" and
       formatfloat() replaces negative values with 6. */
    int prec;
    /* NOTE(review): not used by the helpers visible here -- confirm its
       meaning against the code that sets it. */
    int sign;
};
13332
Guido van Rossumd57fd912000-03-10 22:53:23 +000013333static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013334unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335{
Victor Stinnera47082312012-10-04 02:19:54 +020013336 Py_ssize_t argidx = ctx->argidx;
13337
13338 if (argidx < ctx->arglen) {
13339 ctx->argidx++;
13340 if (ctx->arglen < 0)
13341 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013342 else
Victor Stinnera47082312012-10-04 02:19:54 +020013343 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344 }
13345 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347 return NULL;
13348}
13349
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013350/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013351
Victor Stinnera47082312012-10-04 02:19:54 +020013352/* Format a float into the writer if the writer is not NULL, or into *p_output
13353 otherwise.
13354
13355 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013356static int
Victor Stinnera47082312012-10-04 02:19:54 +020013357formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13358 PyObject **p_output,
13359 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013361 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013363 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013364 int prec;
13365 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013366
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367 x = PyFloat_AsDouble(v);
13368 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013369 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013370
Victor Stinnera47082312012-10-04 02:19:54 +020013371 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013374
Victor Stinnera47082312012-10-04 02:19:54 +020013375 if (arg->flags & F_ALT)
13376 dtoa_flags = Py_DTSF_ALT;
13377 else
13378 dtoa_flags = 0;
13379 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013380 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013381 return -1;
13382 len = strlen(p);
13383 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013384 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13385 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013386 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013387 }
Victor Stinner184252a2012-06-16 02:57:41 +020013388 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013389 writer->pos += len;
13390 }
13391 else
13392 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013393 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013394 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395}
13396
Victor Stinnerd0880d52012-04-27 23:40:13 +020013397/* formatlong() emulates the format codes d, u, o, x and X, and
13398 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13399 * Python's regular ints.
13400 * Return value: a new PyUnicodeObject*, or NULL if error.
13401 * The output string is of the form
13402 * "-"? ("0x" | "0X")? digit+
13403 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13404 * set in flags. The case of hex digits will be correct,
13405 * There will be at least prec digits, zero-filled on the left if
13406 * necessary to get that many.
13407 * val object to be converted
13408 * flags bitmask of format flags; only F_ALT is looked at
13409 * prec minimum number of digits; 0-fill on left if needed
13410 * type a character in [duoxX]; u acts the same as d
13411 *
13412 * CAUTION: o, x and X conversions on regular ints can never
13413 * produce a '-' sign, but can for Python's unbounded ints.
13414 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013415static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013416formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013417{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013418 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013419 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013420 Py_ssize_t i;
13421 int sign; /* 1 if '-', else 0 */
13422 int len; /* number of characters */
13423 Py_ssize_t llen;
13424 int numdigits; /* len == numnondigits + numdigits */
13425 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013426 int prec = arg->prec;
13427 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013428
Victor Stinnerd0880d52012-04-27 23:40:13 +020013429 /* Avoid exceeding SSIZE_T_MAX */
13430 if (prec > INT_MAX-3) {
13431 PyErr_SetString(PyExc_OverflowError,
13432 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013433 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013434 }
13435
13436 assert(PyLong_Check(val));
13437
13438 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013439 default:
13440 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013441 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013442 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013443 case 'u':
13444 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013445 if (PyBool_Check(val))
13446 result = PyNumber_ToBase(val, 10);
13447 else
13448 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013449 break;
13450 case 'o':
13451 numnondigits = 2;
13452 result = PyNumber_ToBase(val, 8);
13453 break;
13454 case 'x':
13455 case 'X':
13456 numnondigits = 2;
13457 result = PyNumber_ToBase(val, 16);
13458 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013459 }
13460 if (!result)
13461 return NULL;
13462
13463 assert(unicode_modifiable(result));
13464 assert(PyUnicode_IS_READY(result));
13465 assert(PyUnicode_IS_ASCII(result));
13466
13467 /* To modify the string in-place, there can only be one reference. */
13468 if (Py_REFCNT(result) != 1) {
13469 PyErr_BadInternalCall();
13470 return NULL;
13471 }
13472 buf = PyUnicode_DATA(result);
13473 llen = PyUnicode_GET_LENGTH(result);
13474 if (llen > INT_MAX) {
13475 PyErr_SetString(PyExc_ValueError,
13476 "string too large in _PyBytes_FormatLong");
13477 return NULL;
13478 }
13479 len = (int)llen;
13480 sign = buf[0] == '-';
13481 numnondigits += sign;
13482 numdigits = len - numnondigits;
13483 assert(numdigits > 0);
13484
13485 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013486 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013487 (type == 'o' || type == 'x' || type == 'X'))) {
13488 assert(buf[sign] == '0');
13489 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13490 buf[sign+1] == 'o');
13491 numnondigits -= 2;
13492 buf += 2;
13493 len -= 2;
13494 if (sign)
13495 buf[0] = '-';
13496 assert(len == numnondigits + numdigits);
13497 assert(numdigits > 0);
13498 }
13499
13500 /* Fill with leading zeroes to meet minimum width. */
13501 if (prec > numdigits) {
13502 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13503 numnondigits + prec);
13504 char *b1;
13505 if (!r1) {
13506 Py_DECREF(result);
13507 return NULL;
13508 }
13509 b1 = PyBytes_AS_STRING(r1);
13510 for (i = 0; i < numnondigits; ++i)
13511 *b1++ = *buf++;
13512 for (i = 0; i < prec - numdigits; i++)
13513 *b1++ = '0';
13514 for (i = 0; i < numdigits; i++)
13515 *b1++ = *buf++;
13516 *b1 = '\0';
13517 Py_DECREF(result);
13518 result = r1;
13519 buf = PyBytes_AS_STRING(result);
13520 len = numnondigits + prec;
13521 }
13522
13523 /* Fix up case for hex conversions. */
13524 if (type == 'X') {
13525 /* Need to convert all lower case letters to upper case.
13526 and need to convert 0x to 0X (and -0x to -0X). */
13527 for (i = 0; i < len; i++)
13528 if (buf[i] >= 'a' && buf[i] <= 'x')
13529 buf[i] -= 'a'-'A';
13530 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013531 if (!PyUnicode_Check(result)
13532 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013533 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013534 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013535 Py_DECREF(result);
13536 result = unicode;
13537 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013538 else if (len != PyUnicode_GET_LENGTH(result)) {
13539 if (PyUnicode_Resize(&result, len) < 0)
13540 Py_CLEAR(result);
13541 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013542 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013543}
13544
Victor Stinner621ef3d2012-10-02 00:33:47 +020013545/* Format an integer.
13546 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013547 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013548 * -1 and raise an exception on error */
13549static int
Victor Stinnera47082312012-10-04 02:19:54 +020013550mainformatlong(PyObject *v,
13551 struct unicode_format_arg_t *arg,
13552 PyObject **p_output,
13553 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013554{
13555 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013556 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013557
13558 if (!PyNumber_Check(v))
13559 goto wrongtype;
13560
13561 if (!PyLong_Check(v)) {
13562 iobj = PyNumber_Long(v);
13563 if (iobj == NULL) {
13564 if (PyErr_ExceptionMatches(PyExc_TypeError))
13565 goto wrongtype;
13566 return -1;
13567 }
13568 assert(PyLong_Check(iobj));
13569 }
13570 else {
13571 iobj = v;
13572 Py_INCREF(iobj);
13573 }
13574
13575 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013576 && arg->width == -1 && arg->prec == -1
13577 && !(arg->flags & (F_SIGN | F_BLANK))
13578 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013579 {
13580 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013581 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013582 int base;
13583
Victor Stinnera47082312012-10-04 02:19:54 +020013584 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013585 {
13586 default:
13587 assert(0 && "'type' not in [diuoxX]");
13588 case 'd':
13589 case 'i':
13590 case 'u':
13591 base = 10;
13592 break;
13593 case 'o':
13594 base = 8;
13595 break;
13596 case 'x':
13597 case 'X':
13598 base = 16;
13599 break;
13600 }
13601
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013602 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13603 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013604 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013605 }
13606 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013607 return 1;
13608 }
13609
Victor Stinnera47082312012-10-04 02:19:54 +020013610 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013611 Py_DECREF(iobj);
13612 if (res == NULL)
13613 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013614 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013615 return 0;
13616
13617wrongtype:
13618 PyErr_Format(PyExc_TypeError,
13619 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013620 "not %.200s",
13621 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013622 return -1;
13623}
13624
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013625static Py_UCS4
13626formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013627{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013628 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013629 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013630 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013631 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013632 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 goto onError;
13634 }
13635 else {
13636 /* Integer input truncated to a character */
13637 long x;
13638 x = PyLong_AsLong(v);
13639 if (x == -1 && PyErr_Occurred())
13640 goto onError;
13641
Victor Stinner8faf8212011-12-08 22:14:11 +010013642 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013643 PyErr_SetString(PyExc_OverflowError,
13644 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013645 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013646 }
13647
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013648 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013649 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013650
Benjamin Peterson29060642009-01-31 22:14:21 +000013651 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013652 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013654 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013655}
13656
Victor Stinnera47082312012-10-04 02:19:54 +020013657/* Parse options of an argument: flags, width, precision.
13658 Handle also "%(name)" syntax.
13659
13660 Return 0 if the argument has been formatted into arg->str.
13661 Return 1 if the argument has been written into ctx->writer,
13662 Raise an exception and return -1 on error. */
13663static int
13664unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13665 struct unicode_format_arg_t *arg)
13666{
13667#define FORMAT_READ(ctx) \
13668 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13669
13670 PyObject *v;
13671
Victor Stinnera47082312012-10-04 02:19:54 +020013672 if (arg->ch == '(') {
13673 /* Get argument value from a dictionary. Example: "%(name)s". */
13674 Py_ssize_t keystart;
13675 Py_ssize_t keylen;
13676 PyObject *key;
13677 int pcount = 1;
13678
13679 if (ctx->dict == NULL) {
13680 PyErr_SetString(PyExc_TypeError,
13681 "format requires a mapping");
13682 return -1;
13683 }
13684 ++ctx->fmtpos;
13685 --ctx->fmtcnt;
13686 keystart = ctx->fmtpos;
13687 /* Skip over balanced parentheses */
13688 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13689 arg->ch = FORMAT_READ(ctx);
13690 if (arg->ch == ')')
13691 --pcount;
13692 else if (arg->ch == '(')
13693 ++pcount;
13694 ctx->fmtpos++;
13695 }
13696 keylen = ctx->fmtpos - keystart - 1;
13697 if (ctx->fmtcnt < 0 || pcount > 0) {
13698 PyErr_SetString(PyExc_ValueError,
13699 "incomplete format key");
13700 return -1;
13701 }
13702 key = PyUnicode_Substring(ctx->fmtstr,
13703 keystart, keystart + keylen);
13704 if (key == NULL)
13705 return -1;
13706 if (ctx->args_owned) {
13707 Py_DECREF(ctx->args);
13708 ctx->args_owned = 0;
13709 }
13710 ctx->args = PyObject_GetItem(ctx->dict, key);
13711 Py_DECREF(key);
13712 if (ctx->args == NULL)
13713 return -1;
13714 ctx->args_owned = 1;
13715 ctx->arglen = -1;
13716 ctx->argidx = -2;
13717 }
13718
13719 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013720 while (--ctx->fmtcnt >= 0) {
13721 arg->ch = FORMAT_READ(ctx);
13722 ctx->fmtpos++;
13723 switch (arg->ch) {
13724 case '-': arg->flags |= F_LJUST; continue;
13725 case '+': arg->flags |= F_SIGN; continue;
13726 case ' ': arg->flags |= F_BLANK; continue;
13727 case '#': arg->flags |= F_ALT; continue;
13728 case '0': arg->flags |= F_ZERO; continue;
13729 }
13730 break;
13731 }
13732
13733 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013734 if (arg->ch == '*') {
13735 v = unicode_format_getnextarg(ctx);
13736 if (v == NULL)
13737 return -1;
13738 if (!PyLong_Check(v)) {
13739 PyErr_SetString(PyExc_TypeError,
13740 "* wants int");
13741 return -1;
13742 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013743 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013744 if (arg->width == -1 && PyErr_Occurred())
13745 return -1;
13746 if (arg->width < 0) {
13747 arg->flags |= F_LJUST;
13748 arg->width = -arg->width;
13749 }
13750 if (--ctx->fmtcnt >= 0) {
13751 arg->ch = FORMAT_READ(ctx);
13752 ctx->fmtpos++;
13753 }
13754 }
13755 else if (arg->ch >= '0' && arg->ch <= '9') {
13756 arg->width = arg->ch - '0';
13757 while (--ctx->fmtcnt >= 0) {
13758 arg->ch = FORMAT_READ(ctx);
13759 ctx->fmtpos++;
13760 if (arg->ch < '0' || arg->ch > '9')
13761 break;
13762 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13763 mixing signed and unsigned comparison. Since arg->ch is between
13764 '0' and '9', casting to int is safe. */
13765 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13766 PyErr_SetString(PyExc_ValueError,
13767 "width too big");
13768 return -1;
13769 }
13770 arg->width = arg->width*10 + (arg->ch - '0');
13771 }
13772 }
13773
13774 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013775 if (arg->ch == '.') {
13776 arg->prec = 0;
13777 if (--ctx->fmtcnt >= 0) {
13778 arg->ch = FORMAT_READ(ctx);
13779 ctx->fmtpos++;
13780 }
13781 if (arg->ch == '*') {
13782 v = unicode_format_getnextarg(ctx);
13783 if (v == NULL)
13784 return -1;
13785 if (!PyLong_Check(v)) {
13786 PyErr_SetString(PyExc_TypeError,
13787 "* wants int");
13788 return -1;
13789 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013790 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013791 if (arg->prec == -1 && PyErr_Occurred())
13792 return -1;
13793 if (arg->prec < 0)
13794 arg->prec = 0;
13795 if (--ctx->fmtcnt >= 0) {
13796 arg->ch = FORMAT_READ(ctx);
13797 ctx->fmtpos++;
13798 }
13799 }
13800 else if (arg->ch >= '0' && arg->ch <= '9') {
13801 arg->prec = arg->ch - '0';
13802 while (--ctx->fmtcnt >= 0) {
13803 arg->ch = FORMAT_READ(ctx);
13804 ctx->fmtpos++;
13805 if (arg->ch < '0' || arg->ch > '9')
13806 break;
13807 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13808 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013809 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013810 return -1;
13811 }
13812 arg->prec = arg->prec*10 + (arg->ch - '0');
13813 }
13814 }
13815 }
13816
13817 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13818 if (ctx->fmtcnt >= 0) {
13819 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13820 if (--ctx->fmtcnt >= 0) {
13821 arg->ch = FORMAT_READ(ctx);
13822 ctx->fmtpos++;
13823 }
13824 }
13825 }
13826 if (ctx->fmtcnt < 0) {
13827 PyErr_SetString(PyExc_ValueError,
13828 "incomplete format");
13829 return -1;
13830 }
13831 return 0;
13832
13833#undef FORMAT_READ
13834}
13835
13836/* Format one argument. Supported conversion specifiers:
13837
13838 - "s", "r", "a": any type
13839 - "i", "d", "u", "o", "x", "X": int
13840 - "e", "E", "f", "F", "g", "G": float
13841 - "c": int or str (1 character)
13842
Victor Stinner8dbd4212012-12-04 09:30:24 +010013843 When possible, the output is written directly into the Unicode writer
13844 (ctx->writer). A string is created when padding is required.
13845
Victor Stinnera47082312012-10-04 02:19:54 +020013846 Return 0 if the argument has been formatted into *p_str,
13847 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013848 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013849static int
13850unicode_format_arg_format(struct unicode_formatter_t *ctx,
13851 struct unicode_format_arg_t *arg,
13852 PyObject **p_str)
13853{
13854 PyObject *v;
13855 _PyUnicodeWriter *writer = &ctx->writer;
13856
13857 if (ctx->fmtcnt == 0)
13858 ctx->writer.overallocate = 0;
13859
13860 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013861 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020013862 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013863 return 1;
13864 }
13865
13866 v = unicode_format_getnextarg(ctx);
13867 if (v == NULL)
13868 return -1;
13869
Victor Stinnera47082312012-10-04 02:19:54 +020013870
13871 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013872 case 's':
13873 case 'r':
13874 case 'a':
13875 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13876 /* Fast path */
13877 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13878 return -1;
13879 return 1;
13880 }
13881
13882 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13883 *p_str = v;
13884 Py_INCREF(*p_str);
13885 }
13886 else {
13887 if (arg->ch == 's')
13888 *p_str = PyObject_Str(v);
13889 else if (arg->ch == 'r')
13890 *p_str = PyObject_Repr(v);
13891 else
13892 *p_str = PyObject_ASCII(v);
13893 }
13894 break;
13895
13896 case 'i':
13897 case 'd':
13898 case 'u':
13899 case 'o':
13900 case 'x':
13901 case 'X':
13902 {
13903 int ret = mainformatlong(v, arg, p_str, writer);
13904 if (ret != 0)
13905 return ret;
13906 arg->sign = 1;
13907 break;
13908 }
13909
13910 case 'e':
13911 case 'E':
13912 case 'f':
13913 case 'F':
13914 case 'g':
13915 case 'G':
13916 if (arg->width == -1 && arg->prec == -1
13917 && !(arg->flags & (F_SIGN | F_BLANK)))
13918 {
13919 /* Fast path */
13920 if (formatfloat(v, arg, NULL, writer) == -1)
13921 return -1;
13922 return 1;
13923 }
13924
13925 arg->sign = 1;
13926 if (formatfloat(v, arg, p_str, NULL) == -1)
13927 return -1;
13928 break;
13929
13930 case 'c':
13931 {
13932 Py_UCS4 ch = formatchar(v);
13933 if (ch == (Py_UCS4) -1)
13934 return -1;
13935 if (arg->width == -1 && arg->prec == -1) {
13936 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013937 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020013938 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013939 return 1;
13940 }
13941 *p_str = PyUnicode_FromOrdinal(ch);
13942 break;
13943 }
13944
13945 default:
13946 PyErr_Format(PyExc_ValueError,
13947 "unsupported format character '%c' (0x%x) "
13948 "at index %zd",
13949 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13950 (int)arg->ch,
13951 ctx->fmtpos - 1);
13952 return -1;
13953 }
13954 if (*p_str == NULL)
13955 return -1;
13956 assert (PyUnicode_Check(*p_str));
13957 return 0;
13958}
13959
13960static int
13961unicode_format_arg_output(struct unicode_formatter_t *ctx,
13962 struct unicode_format_arg_t *arg,
13963 PyObject *str)
13964{
13965 Py_ssize_t len;
13966 enum PyUnicode_Kind kind;
13967 void *pbuf;
13968 Py_ssize_t pindex;
13969 Py_UCS4 signchar;
13970 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013971 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020013972 Py_ssize_t sublen;
13973 _PyUnicodeWriter *writer = &ctx->writer;
13974 Py_UCS4 fill;
13975
13976 fill = ' ';
13977 if (arg->sign && arg->flags & F_ZERO)
13978 fill = '0';
13979
13980 if (PyUnicode_READY(str) == -1)
13981 return -1;
13982
13983 len = PyUnicode_GET_LENGTH(str);
13984 if ((arg->width == -1 || arg->width <= len)
13985 && (arg->prec == -1 || arg->prec >= len)
13986 && !(arg->flags & (F_SIGN | F_BLANK)))
13987 {
13988 /* Fast path */
13989 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13990 return -1;
13991 return 0;
13992 }
13993
13994 /* Truncate the string for "s", "r" and "a" formats
13995 if the precision is set */
13996 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13997 if (arg->prec >= 0 && len > arg->prec)
13998 len = arg->prec;
13999 }
14000
14001 /* Adjust sign and width */
14002 kind = PyUnicode_KIND(str);
14003 pbuf = PyUnicode_DATA(str);
14004 pindex = 0;
14005 signchar = '\0';
14006 if (arg->sign) {
14007 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14008 if (ch == '-' || ch == '+') {
14009 signchar = ch;
14010 len--;
14011 pindex++;
14012 }
14013 else if (arg->flags & F_SIGN)
14014 signchar = '+';
14015 else if (arg->flags & F_BLANK)
14016 signchar = ' ';
14017 else
14018 arg->sign = 0;
14019 }
14020 if (arg->width < len)
14021 arg->width = len;
14022
14023 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014024 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014025 if (!(arg->flags & F_LJUST)) {
14026 if (arg->sign) {
14027 if ((arg->width-1) > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014028 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014029 }
14030 else {
14031 if (arg->width > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014032 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014033 }
14034 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014035 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14036 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14037 maxchar = MAX_MAXCHAR(maxchar, strmaxchar);
14038 }
14039
Victor Stinnera47082312012-10-04 02:19:54 +020014040 buflen = arg->width;
14041 if (arg->sign && len == arg->width)
14042 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014043 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014044 return -1;
14045
14046 /* Write the sign if needed */
14047 if (arg->sign) {
14048 if (fill != ' ') {
14049 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14050 writer->pos += 1;
14051 }
14052 if (arg->width > len)
14053 arg->width--;
14054 }
14055
14056 /* Write the numeric prefix for "x", "X" and "o" formats
14057 if the alternate form is used.
14058 For example, write "0x" for the "%#x" format. */
14059 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14060 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14061 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14062 if (fill != ' ') {
14063 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14064 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14065 writer->pos += 2;
14066 pindex += 2;
14067 }
14068 arg->width -= 2;
14069 if (arg->width < 0)
14070 arg->width = 0;
14071 len -= 2;
14072 }
14073
14074 /* Pad left with the fill character if needed */
14075 if (arg->width > len && !(arg->flags & F_LJUST)) {
14076 sublen = arg->width - len;
14077 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14078 writer->pos += sublen;
14079 arg->width = len;
14080 }
14081
14082 /* If padding with spaces: write sign if needed and/or numeric prefix if
14083 the alternate form is used */
14084 if (fill == ' ') {
14085 if (arg->sign) {
14086 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14087 writer->pos += 1;
14088 }
14089 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14090 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14091 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14092 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14093 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14094 writer->pos += 2;
14095 pindex += 2;
14096 }
14097 }
14098
14099 /* Write characters */
14100 if (len) {
14101 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14102 str, pindex, len);
14103 writer->pos += len;
14104 }
14105
14106 /* Pad right with the fill character if needed */
14107 if (arg->width > len) {
14108 sublen = arg->width - len;
14109 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14110 writer->pos += sublen;
14111 }
14112 return 0;
14113}
14114
14115/* Helper of PyUnicode_Format(): format one arg.
14116 Return 0 on success, raise an exception and return -1 on error. */
14117static int
14118unicode_format_arg(struct unicode_formatter_t *ctx)
14119{
14120 struct unicode_format_arg_t arg;
14121 PyObject *str;
14122 int ret;
14123
Victor Stinner8dbd4212012-12-04 09:30:24 +010014124 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14125 arg.flags = 0;
14126 arg.width = -1;
14127 arg.prec = -1;
14128 arg.sign = 0;
14129 str = NULL;
14130
Victor Stinnera47082312012-10-04 02:19:54 +020014131 ret = unicode_format_arg_parse(ctx, &arg);
14132 if (ret == -1)
14133 return -1;
14134
14135 ret = unicode_format_arg_format(ctx, &arg, &str);
14136 if (ret == -1)
14137 return -1;
14138
14139 if (ret != 1) {
14140 ret = unicode_format_arg_output(ctx, &arg, str);
14141 Py_DECREF(str);
14142 if (ret == -1)
14143 return -1;
14144 }
14145
14146 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14147 PyErr_SetString(PyExc_TypeError,
14148 "not all arguments converted during string formatting");
14149 return -1;
14150 }
14151 return 0;
14152}
14153
Alexander Belopolsky40018472011-02-26 01:02:56 +000014154PyObject *
14155PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014156{
Victor Stinnera47082312012-10-04 02:19:54 +020014157 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014158
Guido van Rossumd57fd912000-03-10 22:53:23 +000014159 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014160 PyErr_BadInternalCall();
14161 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014162 }
Victor Stinnera47082312012-10-04 02:19:54 +020014163
14164 ctx.fmtstr = PyUnicode_FromObject(format);
14165 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014166 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014167 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14168 Py_DECREF(ctx.fmtstr);
14169 return NULL;
14170 }
14171 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14172 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14173 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14174 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014175
Victor Stinnera47082312012-10-04 02:19:54 +020014176 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014177
Guido van Rossumd57fd912000-03-10 22:53:23 +000014178 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014179 ctx.arglen = PyTuple_Size(args);
14180 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014181 }
14182 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014183 ctx.arglen = -1;
14184 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014185 }
Victor Stinnera47082312012-10-04 02:19:54 +020014186 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014187 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014188 ctx.dict = args;
14189 else
14190 ctx.dict = NULL;
14191 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014192
Victor Stinnera47082312012-10-04 02:19:54 +020014193 while (--ctx.fmtcnt >= 0) {
14194 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014195 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014196
14197 nonfmtpos = ctx.fmtpos++;
14198 while (ctx.fmtcnt >= 0 &&
14199 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14200 ctx.fmtpos++;
14201 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014202 }
Victor Stinnera47082312012-10-04 02:19:54 +020014203 if (ctx.fmtcnt < 0) {
14204 ctx.fmtpos--;
14205 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014206 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014207
Victor Stinnercfc4c132013-04-03 01:48:39 +020014208 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14209 nonfmtpos, ctx.fmtpos) < 0)
14210 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014211 }
14212 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014213 ctx.fmtpos++;
14214 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014215 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014216 }
14217 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014218
Victor Stinnera47082312012-10-04 02:19:54 +020014219 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014220 PyErr_SetString(PyExc_TypeError,
14221 "not all arguments converted during string formatting");
14222 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014223 }
14224
Victor Stinnera47082312012-10-04 02:19:54 +020014225 if (ctx.args_owned) {
14226 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014227 }
Victor Stinnera47082312012-10-04 02:19:54 +020014228 Py_DECREF(ctx.fmtstr);
14229 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014230
Benjamin Peterson29060642009-01-31 22:14:21 +000014231 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014232 Py_DECREF(ctx.fmtstr);
14233 _PyUnicodeWriter_Dealloc(&ctx.writer);
14234 if (ctx.args_owned) {
14235 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014236 }
14237 return NULL;
14238}
14239
Jeremy Hylton938ace62002-07-17 16:30:39 +000014240static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014241unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14242
Tim Peters6d6c1a32001-08-02 04:15:00 +000014243static PyObject *
14244unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14245{
Benjamin Peterson29060642009-01-31 22:14:21 +000014246 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014247 static char *kwlist[] = {"object", "encoding", "errors", 0};
14248 char *encoding = NULL;
14249 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014250
Benjamin Peterson14339b62009-01-31 16:36:08 +000014251 if (type != &PyUnicode_Type)
14252 return unicode_subtype_new(type, args, kwds);
14253 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014254 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014255 return NULL;
14256 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014257 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014258 if (encoding == NULL && errors == NULL)
14259 return PyObject_Str(x);
14260 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014261 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014262}
14263
Guido van Rossume023fe02001-08-30 03:12:59 +000014264static PyObject *
14265unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14266{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014267 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014268 Py_ssize_t length, char_size;
14269 int share_wstr, share_utf8;
14270 unsigned int kind;
14271 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014272
Benjamin Peterson14339b62009-01-31 16:36:08 +000014273 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014274
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014275 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014276 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014277 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014278 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014279 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014280 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014281 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014282 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014283
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014284 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014285 if (self == NULL) {
14286 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014287 return NULL;
14288 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014289 kind = PyUnicode_KIND(unicode);
14290 length = PyUnicode_GET_LENGTH(unicode);
14291
14292 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014293#ifdef Py_DEBUG
14294 _PyUnicode_HASH(self) = -1;
14295#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014296 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014297#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014298 _PyUnicode_STATE(self).interned = 0;
14299 _PyUnicode_STATE(self).kind = kind;
14300 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014301 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014302 _PyUnicode_STATE(self).ready = 1;
14303 _PyUnicode_WSTR(self) = NULL;
14304 _PyUnicode_UTF8_LENGTH(self) = 0;
14305 _PyUnicode_UTF8(self) = NULL;
14306 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014307 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014308
14309 share_utf8 = 0;
14310 share_wstr = 0;
14311 if (kind == PyUnicode_1BYTE_KIND) {
14312 char_size = 1;
14313 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14314 share_utf8 = 1;
14315 }
14316 else if (kind == PyUnicode_2BYTE_KIND) {
14317 char_size = 2;
14318 if (sizeof(wchar_t) == 2)
14319 share_wstr = 1;
14320 }
14321 else {
14322 assert(kind == PyUnicode_4BYTE_KIND);
14323 char_size = 4;
14324 if (sizeof(wchar_t) == 4)
14325 share_wstr = 1;
14326 }
14327
14328 /* Ensure we won't overflow the length. */
14329 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14330 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014331 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014332 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014333 data = PyObject_MALLOC((length + 1) * char_size);
14334 if (data == NULL) {
14335 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014336 goto onError;
14337 }
14338
Victor Stinnerc3c74152011-10-02 20:39:55 +020014339 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014340 if (share_utf8) {
14341 _PyUnicode_UTF8_LENGTH(self) = length;
14342 _PyUnicode_UTF8(self) = data;
14343 }
14344 if (share_wstr) {
14345 _PyUnicode_WSTR_LENGTH(self) = length;
14346 _PyUnicode_WSTR(self) = (wchar_t *)data;
14347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014348
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014349 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014350 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014351 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014352#ifdef Py_DEBUG
14353 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14354#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014355 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014356 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014357
14358onError:
14359 Py_DECREF(unicode);
14360 Py_DECREF(self);
14361 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014362}
14363
/* Docstring of the str type; installed in the tp_doc slot of
   PyUnicode_Type below. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014364PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014365"str(object='') -> str\n\
14366str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014367\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014368Create a new string object from the given object. If encoding or\n\
14369errors is specified, then the object must expose a data buffer\n\
14370that will be decoded using the given encoding and error handler.\n\
14371Otherwise, returns the result of object.__str__() (if defined)\n\
14372or repr(object).\n\
14373encoding defaults to sys.getdefaultencoding().\n\
14374errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014375
/* Forward declaration: unicode_iter fills the tp_iter slot below and is
   defined elsewhere in this file. */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014376static PyObject *unicode_iter(PyObject *seq);
14377
/* Type object of the built-in "str" type.  Slots left at 0 are not set
   by this initializer. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014378PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014379 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014380 "str", /* tp_name */
14381 sizeof(PyUnicodeObject), /* tp_basicsize */
14382 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014383 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014384 (destructor)unicode_dealloc, /* tp_dealloc */
14385 0, /* tp_print */
14386 0, /* tp_getattr */
14387 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014388 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014389 unicode_repr, /* tp_repr */
14390 &unicode_as_number, /* tp_as_number */
14391 &unicode_as_sequence, /* tp_as_sequence */
14392 &unicode_as_mapping, /* tp_as_mapping */
14393 (hashfunc) unicode_hash, /* tp_hash*/
14394 0, /* tp_call*/
14395 (reprfunc) unicode_str, /* tp_str */
14396 PyObject_GenericGetAttr, /* tp_getattro */
14397 0, /* tp_setattro */
14398 0, /* tp_as_buffer */
14399 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014400 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014401 unicode_doc, /* tp_doc */
14402 0, /* tp_traverse */
14403 0, /* tp_clear */
14404 PyUnicode_RichCompare, /* tp_richcompare */
14405 0, /* tp_weaklistoffset */
14406 unicode_iter, /* tp_iter */
14407 0, /* tp_iternext */
14408 unicode_methods, /* tp_methods */
14409 0, /* tp_members */
14410 0, /* tp_getset */
14411 &PyBaseObject_Type, /* tp_base */
14412 0, /* tp_dict */
14413 0, /* tp_descr_get */
14414 0, /* tp_descr_set */
14415 0, /* tp_dictoffset */
14416 0, /* tp_init */
14417 0, /* tp_alloc */
14418 unicode_new, /* tp_new */
14419 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014420};
14421
14422/* Initialize the Unicode implementation */
14423
Victor Stinner3a50e702011-10-18 21:21:00 +020014424int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014425{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014426 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014427 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014428 0x000A, /* LINE FEED */
14429 0x000D, /* CARRIAGE RETURN */
14430 0x001C, /* FILE SEPARATOR */
14431 0x001D, /* GROUP SEPARATOR */
14432 0x001E, /* RECORD SEPARATOR */
14433 0x0085, /* NEXT LINE */
14434 0x2028, /* LINE SEPARATOR */
14435 0x2029, /* PARAGRAPH SEPARATOR */
14436 };
14437
Fred Drakee4315f52000-05-09 19:53:39 +000014438 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014439 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014440 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014441 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014442 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014443
Guido van Rossumcacfc072002-05-24 19:01:59 +000014444 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014445 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014446
14447 /* initialize the linebreak bloom filter */
14448 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014449 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014450 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014451
14452 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014453
Benjamin Petersonc4311282012-10-30 23:21:10 -040014454 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14455 Py_FatalError("Can't initialize field name iterator type");
14456
14457 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14458 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014459
Victor Stinner3a50e702011-10-18 21:21:00 +020014460#ifdef HAVE_MBCS
14461 winver.dwOSVersionInfoSize = sizeof(winver);
14462 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14463 PyErr_SetFromWindowsErr(0);
14464 return -1;
14465 }
14466#endif
14467 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014468}
14469
14470/* Finalize the Unicode implementation */
14471
/* Nothing is released here: the function unconditionally reports 0
   (presumably the number of freed objects -- confirm against callers). */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14477
Guido van Rossumd57fd912000-03-10 22:53:23 +000014478void
Thomas Wouters78890102000-07-22 19:25:51 +000014479_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014480{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014481 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014482
Serhiy Storchaka05997252013-01-26 12:14:02 +020014483 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014484
Serhiy Storchaka05997252013-01-26 12:14:02 +020014485 for (i = 0; i < 256; i++)
14486 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014487 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014488 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014489}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014490
Walter Dörwald16807132007-05-25 13:52:07 +000014491void
14492PyUnicode_InternInPlace(PyObject **p)
14493{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014494 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014495 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014496#ifdef Py_DEBUG
14497 assert(s != NULL);
14498 assert(_PyUnicode_CHECK(s));
14499#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014500 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014501 return;
14502#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014503 /* If it's a subclass, we don't really know what putting
14504 it in the interned dict might do. */
14505 if (!PyUnicode_CheckExact(s))
14506 return;
14507 if (PyUnicode_CHECK_INTERNED(s))
14508 return;
14509 if (interned == NULL) {
14510 interned = PyDict_New();
14511 if (interned == NULL) {
14512 PyErr_Clear(); /* Don't leave an exception */
14513 return;
14514 }
14515 }
14516 /* It might be that the GetItem call fails even
14517 though the key is present in the dictionary,
14518 namely when this happens during a stack overflow. */
14519 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014520 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014521 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014522
Benjamin Peterson29060642009-01-31 22:14:21 +000014523 if (t) {
14524 Py_INCREF(t);
14525 Py_DECREF(*p);
14526 *p = t;
14527 return;
14528 }
Walter Dörwald16807132007-05-25 13:52:07 +000014529
Benjamin Peterson14339b62009-01-31 16:36:08 +000014530 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014531 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014532 PyErr_Clear();
14533 PyThreadState_GET()->recursion_critical = 0;
14534 return;
14535 }
14536 PyThreadState_GET()->recursion_critical = 0;
14537 /* The two references in interned are not counted by refcnt.
14538 The deallocator will take care of this */
14539 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014540 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014541}
14542
14543void
14544PyUnicode_InternImmortal(PyObject **p)
14545{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014546 PyUnicode_InternInPlace(p);
14547 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014548 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014549 Py_INCREF(*p);
14550 }
Walter Dörwald16807132007-05-25 13:52:07 +000014551}
14552
14553PyObject *
14554PyUnicode_InternFromString(const char *cp)
14555{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014556 PyObject *s = PyUnicode_FromString(cp);
14557 if (s == NULL)
14558 return NULL;
14559 PyUnicode_InternInPlace(&s);
14560 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014561}
14562
Alexander Belopolsky40018472011-02-26 01:02:56 +000014563void
14564_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014565{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014566 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014567 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014568 Py_ssize_t i, n;
14569 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014570
Benjamin Peterson14339b62009-01-31 16:36:08 +000014571 if (interned == NULL || !PyDict_Check(interned))
14572 return;
14573 keys = PyDict_Keys(interned);
14574 if (keys == NULL || !PyList_Check(keys)) {
14575 PyErr_Clear();
14576 return;
14577 }
Walter Dörwald16807132007-05-25 13:52:07 +000014578
Benjamin Peterson14339b62009-01-31 16:36:08 +000014579 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14580 detector, interned unicode strings are not forcibly deallocated;
14581 rather, we give them their stolen references back, and then clear
14582 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014583
Benjamin Peterson14339b62009-01-31 16:36:08 +000014584 n = PyList_GET_SIZE(keys);
14585 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014586 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014587 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014588 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014589 if (PyUnicode_READY(s) == -1) {
14590 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014591 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014593 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014594 case SSTATE_NOT_INTERNED:
14595 /* XXX Shouldn't happen */
14596 break;
14597 case SSTATE_INTERNED_IMMORTAL:
14598 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014599 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014600 break;
14601 case SSTATE_INTERNED_MORTAL:
14602 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014603 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014604 break;
14605 default:
14606 Py_FatalError("Inconsistent interned string state.");
14607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014608 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014609 }
14610 fprintf(stderr, "total size of all interned strings: "
14611 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14612 "mortal/immortal\n", mortal_size, immortal_size);
14613 Py_DECREF(keys);
14614 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014615 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014616}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014617
14618
14619/********************* Unicode Iterator **************************/
14620
14621typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014622 PyObject_HEAD
14623 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014624 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014625} unicodeiterobject;
14626
14627static void
14628unicodeiter_dealloc(unicodeiterobject *it)
14629{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014630 _PyObject_GC_UNTRACK(it);
14631 Py_XDECREF(it->it_seq);
14632 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014633}
14634
14635static int
14636unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14637{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014638 Py_VISIT(it->it_seq);
14639 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014640}
14641
14642static PyObject *
14643unicodeiter_next(unicodeiterobject *it)
14644{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014645 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014646
Benjamin Peterson14339b62009-01-31 16:36:08 +000014647 assert(it != NULL);
14648 seq = it->it_seq;
14649 if (seq == NULL)
14650 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014651 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014653 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14654 int kind = PyUnicode_KIND(seq);
14655 void *data = PyUnicode_DATA(seq);
14656 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14657 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014658 if (item != NULL)
14659 ++it->it_index;
14660 return item;
14661 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014662
Benjamin Peterson14339b62009-01-31 16:36:08 +000014663 Py_DECREF(seq);
14664 it->it_seq = NULL;
14665 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014666}
14667
14668static PyObject *
14669unicodeiter_len(unicodeiterobject *it)
14670{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014671 Py_ssize_t len = 0;
14672 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014673 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014674 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014675}
14676
14677PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14678
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014679static PyObject *
14680unicodeiter_reduce(unicodeiterobject *it)
14681{
14682 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014683 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014684 it->it_seq, it->it_index);
14685 } else {
14686 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14687 if (u == NULL)
14688 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014689 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014690 }
14691}
14692
14693PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14694
14695static PyObject *
14696unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14697{
14698 Py_ssize_t index = PyLong_AsSsize_t(state);
14699 if (index == -1 && PyErr_Occurred())
14700 return NULL;
14701 if (index < 0)
14702 index = 0;
14703 it->it_index = index;
14704 Py_RETURN_NONE;
14705}
14706
14707PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14708
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014709static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014710 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014711 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014712 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14713 reduce_doc},
14714 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14715 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014716 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014717};
14718
14719PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014720 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14721 "str_iterator", /* tp_name */
14722 sizeof(unicodeiterobject), /* tp_basicsize */
14723 0, /* tp_itemsize */
14724 /* methods */
14725 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14726 0, /* tp_print */
14727 0, /* tp_getattr */
14728 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014729 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014730 0, /* tp_repr */
14731 0, /* tp_as_number */
14732 0, /* tp_as_sequence */
14733 0, /* tp_as_mapping */
14734 0, /* tp_hash */
14735 0, /* tp_call */
14736 0, /* tp_str */
14737 PyObject_GenericGetAttr, /* tp_getattro */
14738 0, /* tp_setattro */
14739 0, /* tp_as_buffer */
14740 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14741 0, /* tp_doc */
14742 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14743 0, /* tp_clear */
14744 0, /* tp_richcompare */
14745 0, /* tp_weaklistoffset */
14746 PyObject_SelfIter, /* tp_iter */
14747 (iternextfunc)unicodeiter_next, /* tp_iternext */
14748 unicodeiter_methods, /* tp_methods */
14749 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014750};
14751
14752static PyObject *
14753unicode_iter(PyObject *seq)
14754{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014755 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014756
Benjamin Peterson14339b62009-01-31 16:36:08 +000014757 if (!PyUnicode_Check(seq)) {
14758 PyErr_BadInternalCall();
14759 return NULL;
14760 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014761 if (PyUnicode_READY(seq) == -1)
14762 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014763 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14764 if (it == NULL)
14765 return NULL;
14766 it->it_index = 0;
14767 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014768 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014769 _PyObject_GC_TRACK(it);
14770 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014771}
14772
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014773
14774size_t
14775Py_UNICODE_strlen(const Py_UNICODE *u)
14776{
14777 int res = 0;
14778 while(*u++)
14779 res++;
14780 return res;
14781}
14782
14783Py_UNICODE*
14784Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14785{
14786 Py_UNICODE *u = s1;
14787 while ((*u++ = *s2++));
14788 return s1;
14789}
14790
14791Py_UNICODE*
14792Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14793{
14794 Py_UNICODE *u = s1;
14795 while ((*u++ = *s2++))
14796 if (n-- == 0)
14797 break;
14798 return s1;
14799}
14800
14801Py_UNICODE*
14802Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14803{
14804 Py_UNICODE *u1 = s1;
14805 u1 += Py_UNICODE_strlen(u1);
14806 Py_UNICODE_strcpy(u1, s2);
14807 return s1;
14808}
14809
14810int
14811Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14812{
14813 while (*s1 && *s2 && *s1 == *s2)
14814 s1++, s2++;
14815 if (*s1 && *s2)
14816 return (*s1 < *s2) ? -1 : +1;
14817 if (*s1)
14818 return 1;
14819 if (*s2)
14820 return -1;
14821 return 0;
14822}
14823
14824int
14825Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14826{
14827 register Py_UNICODE u1, u2;
14828 for (; n != 0; n--) {
14829 u1 = *s1;
14830 u2 = *s2;
14831 if (u1 != u2)
14832 return (u1 < u2) ? -1 : +1;
14833 if (u1 == '\0')
14834 return 0;
14835 s1++;
14836 s2++;
14837 }
14838 return 0;
14839}
14840
14841Py_UNICODE*
14842Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14843{
14844 const Py_UNICODE *p;
14845 for (p = s; *p; p++)
14846 if (*p == c)
14847 return (Py_UNICODE*)p;
14848 return NULL;
14849}
14850
14851Py_UNICODE*
14852Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14853{
14854 const Py_UNICODE *p;
14855 p = s + Py_UNICODE_strlen(s);
14856 while (p != s) {
14857 p--;
14858 if (*p == c)
14859 return (Py_UNICODE*)p;
14860 }
14861 return NULL;
14862}
Victor Stinner331ea922010-08-10 16:37:20 +000014863
Victor Stinner71133ff2010-09-01 23:43:53 +000014864Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014865PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014866{
Victor Stinner577db2c2011-10-11 22:12:48 +020014867 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014868 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014870 if (!PyUnicode_Check(unicode)) {
14871 PyErr_BadArgument();
14872 return NULL;
14873 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014874 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014875 if (u == NULL)
14876 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014877 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014878 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014879 PyErr_NoMemory();
14880 return NULL;
14881 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014882 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014883 size *= sizeof(Py_UNICODE);
14884 copy = PyMem_Malloc(size);
14885 if (copy == NULL) {
14886 PyErr_NoMemory();
14887 return NULL;
14888 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014889 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014890 return copy;
14891}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014892
Georg Brandl66c221e2010-10-14 07:04:07 +000014893/* A _string module, to export formatter_parser and formatter_field_name_split
14894 to the string.Formatter class implemented in Python. */
14895
14896static PyMethodDef _string_methods[] = {
14897 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14898 METH_O, PyDoc_STR("split the argument as a field name")},
14899 {"formatter_parser", (PyCFunction) formatter_parser,
14900 METH_O, PyDoc_STR("parse the argument as a format string")},
14901 {NULL, NULL}
14902};
14903
14904static struct PyModuleDef _string_module = {
14905 PyModuleDef_HEAD_INIT,
14906 "_string",
14907 PyDoc_STR("string helper module"),
14908 0,
14909 _string_methods,
14910 NULL,
14911 NULL,
14912 NULL,
14913 NULL
14914};
14915
14916PyMODINIT_FUNC
14917PyInit__string(void)
14918{
14919 return PyModule_Create(&_string_module);
14920}
14921
14922
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014923#ifdef __cplusplus
14924}
14925#endif