blob: bf49ce5962c6ef29b280f0dc1d361086d461b1b7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
Serhiy Storchaka05997252013-01-26 12:14:02 +020052NOTE: In the interpreter's initialization phase, some globals are currently
53 initialized dynamically as needed. In the process Unicode objects may
54 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055
56*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10FFFF

/* Object check used by the assertions in this file.  Release builds only
   test the type; debug builds call _PyUnicode_CheckConsistency() with the
   content scan disabled. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* Field accessors.

   The underscore-prefixed macros read (and can be assigned through) the
   struct member directly.  PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH()
   additionally assert that the object is a ready unicode object and handle
   compact ASCII strings, for which the UTF-8 data is the character buffer
   placed right after the PyASCIIObject header and the UTF-8 length is the
   string length. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject *)(op))->utf8)
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((char *)((PyASCIIObject *)(op) + 1)) :            \
         _PyUnicode_UTF8(op))

#define _PyUnicode_UTF8_LENGTH(op)                          \
    (((PyCompactUnicodeObject *)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((PyASCIIObject *)(op))->length :                  \
         _PyUnicode_UTF8_LENGTH(op))

#define _PyUnicode_WSTR(op) (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                          \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Checked variants: assert the object type before reading the field. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* Cheap stand-in for Py_MAX() when combining two maximum characters: the
   bitwise OR of the operands is used instead of a comparison.  Use it when
   you are computing the second argument of PyUnicode_New(). */
#define MAX_MAXCHAR(maxchar1, maxchar2) ((maxchar1) | (maxchar2))
111
/* File-local replacement for PyUnicode_READY(): asserts the object type,
   evaluates to 0 when the string is already ready and otherwise to the
   result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of a (non compact-ASCII) string is the same
   buffer as its character data. */
#define _PyUnicode_SHARE_UTF8(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),               \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of a string is the same buffer as its character
   data.  The macro argument is used throughout, so it works no matter how
   the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separately allocated UTF-8 block, i.e.
   it is not compact ASCII, its utf8 pointer is set, and that pointer is not
   simply the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                    \
      _PyUnicode_UTF8(op) != NULL &&                        \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
134
/* True if the Unicode object owns a separately allocated wstr block: the
   wstr pointer is set and either the string is not ready yet or the pointer
   differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) != NULL &&                        \
      (!PyUnicode_IS_READY(op) ||                           \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
142
/* Generic helper macro to convert characters of different types.

   from_type, to_type -- valid type names of the source and target
                         characters.
   begin, end         -- pointers of type "from_type *" delimiting the
                         source characters (end is exclusive).
   to                 -- pointer to the buffer that receives the converted
                         characters; it is converted to "to_type *" as a
                         whole expression, so arguments such as "buf + off"
                         are offset in units of the argument's own type.

   Each argument is evaluated exactly once.  The bulk of the input is copied
   four characters per iteration; the remaining 0-3 characters are copied
   one at a time.  All locals carry a leading underscore to keep them out of
   the caller's namespace. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = _end - _iter;                   \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < _unrolled_end) {                 \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < _end)                            \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200166
Walter Dörwald16807132007-05-25 13:52:07 +0000167/* This dictionary holds all interned unicode strings. Note that references
168 to strings in this dictionary are *not* counted in the string's ob_refcnt.
169 When the interned string reaches a refcnt of 0 the string deallocation
170 function will delete the reference from this dictionary.
171
172 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000173 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000174*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200175static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000177/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179
Serhiy Storchaka678db842013-01-26 12:16:36 +0200180#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200181 do { \
182 if (unicode_empty != NULL) \
183 Py_INCREF(unicode_empty); \
184 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185 unicode_empty = PyUnicode_New(0, 0); \
186 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200187 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
189 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200190 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200191 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192
Serhiy Storchaka678db842013-01-26 12:16:36 +0200193#define _Py_RETURN_UNICODE_EMPTY() \
194 do { \
195 _Py_INCREF_UNICODE_EMPTY(); \
196 return unicode_empty; \
197 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200199/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202/* Single character Unicode strings in the Latin-1 range are being
203 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
/* Fast detection of the most frequent whitespace characters.

   Indexed by code point (0..127); an entry is 1 for:
     U+0009 CHARACTER TABULATION   U+001C FILE SEPARATOR
     U+000A LINE FEED              U+001D GROUP SEPARATOR
     U+000B LINE TABULATION        U+001E RECORD SEPARATOR
     U+000C FORM FEED              U+001F UNIT SEPARATOR
     U+000D CARRIAGE RETURN        U+0020 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /*       0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
    /* 0x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 1x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 2x */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 3x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 4x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 5x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 6x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 7x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
236
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100240static int unicode_modifiable(PyObject *unicode);
241
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100244_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200245static PyObject *
246_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
247static PyObject *
248_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
249
250static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000251unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100253 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
255
Alexander Belopolsky40018472011-02-26 01:02:56 +0000256static void
257raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300258 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100259 PyObject *unicode,
260 Py_ssize_t startpos, Py_ssize_t endpos,
261 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000262
/* Same for linebreaks.

   Indexed by code point (0..127); an entry is 1 for:
     U+000A LINE FEED              U+001C FILE SEPARATOR
     U+000B LINE TABULATION        U+001D GROUP SEPARATOR
     U+000C FORM FEED              U+001E RECORD SEPARATOR
     U+000D CARRIAGE RETURN
   and 0 for everything else.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    /*       0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
    /* 0x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 1x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 2x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 3x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 4x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 5x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 6x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 7x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
290
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300291/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
292 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000294PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000296#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 /* This is actually an illegal character, so it should
300 not be passed to unichr. */
301 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#endif
303}
304
#ifdef Py_DEBUG
/* Debug-build verification of the internal invariants of a unicode object.

   op            -- object to verify; must pass PyUnicode_Check().
   check_content -- when non-zero and the string is not in the legacy wchar
                    representation, every character is scanned to verify
                    that the narrowest suitable kind is in use and that the
                    buffer ends with a null character.

   Each violated invariant trips an assert().  Returns 1 when all checks
   pass, so the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character buffer doubles as the wstr buffer */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that has not been made ready */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character buffer must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
420
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100421static PyObject*
422unicode_result_wchar(PyObject *unicode)
423{
424#ifndef Py_DEBUG
425 Py_ssize_t len;
426
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100427 len = _PyUnicode_WSTR_LENGTH(unicode);
428 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200430 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 }
432
433 if (len == 1) {
434 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100435 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100436 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
437 Py_DECREF(unicode);
438 return latin1_char;
439 }
440 }
441
442 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200443 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100444 return NULL;
445 }
446#else
Victor Stinneraa771272012-10-04 02:32:58 +0200447 assert(Py_REFCNT(unicode) == 1);
448
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100449 /* don't make the result ready in debug mode to ensure that the caller
450 makes the string ready before using it */
451 assert(_PyUnicode_CheckConsistency(unicode, 1));
452#endif
453 return unicode;
454}
455
456static PyObject*
457unicode_result_ready(PyObject *unicode)
458{
459 Py_ssize_t length;
460
461 length = PyUnicode_GET_LENGTH(unicode);
462 if (length == 0) {
463 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100464 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200465 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 }
467 return unicode_empty;
468 }
469
470 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200471 void *data = PyUnicode_DATA(unicode);
472 int kind = PyUnicode_KIND(unicode);
473 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 if (ch < 256) {
475 PyObject *latin1_char = unicode_latin1[ch];
476 if (latin1_char != NULL) {
477 if (unicode != latin1_char) {
478 Py_INCREF(latin1_char);
479 Py_DECREF(unicode);
480 }
481 return latin1_char;
482 }
483 else {
484 assert(_PyUnicode_CheckConsistency(unicode, 1));
485 Py_INCREF(unicode);
486 unicode_latin1[ch] = unicode;
487 return unicode;
488 }
489 }
490 }
491
492 assert(_PyUnicode_CheckConsistency(unicode, 1));
493 return unicode;
494}
495
496static PyObject*
497unicode_result(PyObject *unicode)
498{
499 assert(_PyUnicode_CHECK(unicode));
500 if (PyUnicode_IS_READY(unicode))
501 return unicode_result_ready(unicode);
502 else
503 return unicode_result_wchar(unicode);
504}
505
Victor Stinnerc4b49542011-12-11 22:44:26 +0100506static PyObject*
507unicode_result_unchanged(PyObject *unicode)
508{
509 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500510 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100511 return NULL;
512 Py_INCREF(unicode);
513 return unicode;
514 }
515 else
516 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100517 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100518}
519
#if defined(HAVE_MBCS)
/* OS version record; only compiled in when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
526/* stuff to implement simple "bloom filters" for Unicode characters.
527 to keep things simple, we use a single bitmask, using the least 5
528 bits from each unicode characters as the bit index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542#define BLOOM_MASK unsigned long
543
Serhiy Storchaka05997252013-01-26 12:14:02 +0200544static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Antoine Pitrouf068f942010-01-13 14:19:12 +0000546#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
Benjamin Peterson29060642009-01-31 22:14:21 +0000548#define BLOOM_LINEBREAK(ch) \
549 ((ch) < 128U ? ascii_linebreak[(ch)] : \
550 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
/* True if `chr` occurs in the string `str`: the bloom mask rejects most
   misses cheaply before PyUnicode_FindChar() scans the whole string. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000593
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200594/* Compilation of templated routines */
595
596#include "stringlib/asciilib.h"
597#include "stringlib/fastsearch.h"
598#include "stringlib/partition.h"
599#include "stringlib/split.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
602#include "stringlib/find_max_char.h"
603#include "stringlib/localeutil.h"
604#include "stringlib/undef.h"
605
606#include "stringlib/ucs1lib.h"
607#include "stringlib/fastsearch.h"
608#include "stringlib/partition.h"
609#include "stringlib/split.h"
610#include "stringlib/count.h"
611#include "stringlib/find.h"
612#include "stringlib/find_max_char.h"
613#include "stringlib/localeutil.h"
614#include "stringlib/undef.h"
615
616#include "stringlib/ucs2lib.h"
617#include "stringlib/fastsearch.h"
618#include "stringlib/partition.h"
619#include "stringlib/split.h"
620#include "stringlib/count.h"
621#include "stringlib/find.h"
622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
632#include "stringlib/find_max_char.h"
633#include "stringlib/localeutil.h"
634#include "stringlib/undef.h"
635
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200636#include "stringlib/unicodedefs.h"
637#include "stringlib/fastsearch.h"
638#include "stringlib/count.h"
639#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100640#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200641
Guido van Rossumd57fd912000-03-10 22:53:23 +0000642/* --- Unicode Object ----------------------------------------------------- */
643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200644static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200645fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
648 Py_ssize_t size, Py_UCS4 ch,
649 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200650{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200651 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
652
653 switch (kind) {
654 case PyUnicode_1BYTE_KIND:
655 {
656 Py_UCS1 ch1 = (Py_UCS1) ch;
657 if (ch1 == ch)
658 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
659 else
660 return -1;
661 }
662 case PyUnicode_2BYTE_KIND:
663 {
664 Py_UCS2 ch2 = (Py_UCS2) ch;
665 if (ch2 == ch)
666 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
667 else
668 return -1;
669 }
670 case PyUnicode_4BYTE_KIND:
671 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
672 default:
673 assert(0);
674 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676}
677
#ifdef Py_DEBUG
/* Fill the data of an Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten, each byte being set to 0xFF; nothing happens if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
696
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697static PyObject*
698resize_compact(PyObject *unicode, Py_ssize_t length)
699{
700 Py_ssize_t char_size;
701 Py_ssize_t struct_size;
702 Py_ssize_t new_size;
703 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100704 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200705#ifdef Py_DEBUG
706 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
707#endif
708
Victor Stinner79891572012-05-03 13:43:07 +0200709 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200710 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100711 assert(PyUnicode_IS_COMPACT(unicode));
712
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200713 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100714 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715 struct_size = sizeof(PyASCIIObject);
716 else
717 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
721 PyErr_NoMemory();
722 return NULL;
723 }
724 new_size = (struct_size + (length + 1) * char_size);
725
Victor Stinner84def372011-12-11 20:04:56 +0100726 _Py_DEC_REFTOTAL;
727 _Py_ForgetReference(unicode);
728
729 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
730 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100731 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 PyErr_NoMemory();
733 return NULL;
734 }
Victor Stinner84def372011-12-11 20:04:56 +0100735 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100737
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200739 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100741 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200742 _PyUnicode_WSTR_LENGTH(unicode) = length;
743 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100744 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
745 PyObject_DEL(_PyUnicode_WSTR(unicode));
746 _PyUnicode_WSTR(unicode) = NULL;
747 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200748#ifdef Py_DEBUG
749 unicode_fill_invalid(unicode, old_length);
750#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200751 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
752 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200753 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200754 return unicode;
755}
756
Alexander Belopolsky40018472011-02-26 01:02:56 +0000757static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200758resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000759{
Victor Stinner95663112011-10-04 01:03:50 +0200760 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100761 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200762 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200763 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000764
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 if (PyUnicode_IS_READY(unicode)) {
766 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200767 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200769#ifdef Py_DEBUG
770 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
771#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200772
773 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200774 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200775 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
776 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200777
778 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
779 PyErr_NoMemory();
780 return -1;
781 }
782 new_size = (length + 1) * char_size;
783
Victor Stinner7a9105a2011-12-12 00:13:42 +0100784 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
785 {
786 PyObject_DEL(_PyUnicode_UTF8(unicode));
787 _PyUnicode_UTF8(unicode) = NULL;
788 _PyUnicode_UTF8_LENGTH(unicode) = 0;
789 }
790
Victor Stinnerfe226c02011-10-03 03:52:20 +0200791 data = (PyObject *)PyObject_REALLOC(data, new_size);
792 if (data == NULL) {
793 PyErr_NoMemory();
794 return -1;
795 }
796 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200797 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200799 _PyUnicode_WSTR_LENGTH(unicode) = length;
800 }
801 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200802 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200803 _PyUnicode_UTF8_LENGTH(unicode) = length;
804 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 _PyUnicode_LENGTH(unicode) = length;
806 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200807#ifdef Py_DEBUG
808 unicode_fill_invalid(unicode, old_length);
809#endif
Victor Stinner95663112011-10-04 01:03:50 +0200810 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200811 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinner95663112011-10-04 01:03:50 +0200815 assert(_PyUnicode_WSTR(unicode) != NULL);
816
817 /* check for integer overflow */
818 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
819 PyErr_NoMemory();
820 return -1;
821 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100822 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200823 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100824 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200825 if (!wstr) {
826 PyErr_NoMemory();
827 return -1;
828 }
829 _PyUnicode_WSTR(unicode) = wstr;
830 _PyUnicode_WSTR(unicode)[length] = 0;
831 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200832 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000833 return 0;
834}
835
Victor Stinnerfe226c02011-10-03 03:52:20 +0200836static PyObject*
837resize_copy(PyObject *unicode, Py_ssize_t length)
838{
839 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100840 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200841 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100842
Benjamin Petersonbac79492012-01-14 13:34:47 -0500843 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100844 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200845
846 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
847 if (copy == NULL)
848 return NULL;
849
850 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200851 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200852 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200853 }
854 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200855 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100856
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200857 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200858 if (w == NULL)
859 return NULL;
860 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
861 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200862 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
863 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200864 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200865 }
866}
867
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000869 Ux0000 terminated; some code (e.g. new_identifier)
870 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871
872 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000873 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874
875*/
876
Alexander Belopolsky40018472011-02-26 01:02:56 +0000877static PyUnicodeObject *
878_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879{
880 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882
Thomas Wouters477c8d52006-05-27 19:21:47 +0000883 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 if (length == 0 && unicode_empty != NULL) {
885 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200886 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000887 }
888
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000889 /* Ensure we won't overflow the size. */
890 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
891 return (PyUnicodeObject *)PyErr_NoMemory();
892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200893 if (length < 0) {
894 PyErr_SetString(PyExc_SystemError,
895 "Negative size passed to _PyUnicode_New");
896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000897 }
898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
900 if (unicode == NULL)
901 return NULL;
902 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
903 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
904 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100905 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000906 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100907 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200909
Jeremy Hyltond8082792003-09-16 19:41:39 +0000910 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000911 * the caller fails before initializing str -- unicode_resize()
912 * reads str[0], and the Keep-Alive optimization can keep memory
913 * allocated for str alive across a call to unicode_dealloc(unicode).
914 * We don't want unicode_resize to read uninitialized memory in
915 * that case.
916 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode)[0] = 0;
918 _PyUnicode_WSTR(unicode)[length] = 0;
919 _PyUnicode_WSTR_LENGTH(unicode) = length;
920 _PyUnicode_HASH(unicode) = -1;
921 _PyUnicode_STATE(unicode).interned = 0;
922 _PyUnicode_STATE(unicode).kind = 0;
923 _PyUnicode_STATE(unicode).compact = 0;
924 _PyUnicode_STATE(unicode).ready = 0;
925 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200926 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200928 _PyUnicode_UTF8(unicode) = NULL;
929 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100930 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000931 return unicode;
932}
933
Victor Stinnerf42dc442011-10-02 23:33:16 +0200934static const char*
935unicode_kind_name(PyObject *unicode)
936{
Victor Stinner42dfd712011-10-03 14:41:45 +0200937 /* don't check consistency: unicode_kind_name() is called from
938 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200939 if (!PyUnicode_IS_COMPACT(unicode))
940 {
941 if (!PyUnicode_IS_READY(unicode))
942 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600943 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200944 {
945 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200946 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200947 return "legacy ascii";
948 else
949 return "legacy latin1";
950 case PyUnicode_2BYTE_KIND:
951 return "legacy UCS2";
952 case PyUnicode_4BYTE_KIND:
953 return "legacy UCS4";
954 default:
955 return "<legacy invalid kind>";
956 }
957 }
958 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600959 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200960 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200961 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200962 return "ascii";
963 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200965 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200966 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200967 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 default:
970 return "<invalid compact kind>";
971 }
972}
973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
979
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
983void *_PyUnicode_data(void *unicode){
984 printf("obj %p\n", unicode);
985 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
986 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
987 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
988 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
989 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
990 return PyUnicode_DATA(unicode);
991}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200992
993void
994_PyUnicode_Dump(PyObject *op)
995{
996 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200997 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
998 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
999 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001000
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001002 {
1003 if (ascii->state.ascii)
1004 data = (ascii + 1);
1005 else
1006 data = (compact + 1);
1007 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001008 else
1009 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001010 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1011
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 if (ascii->wstr == data)
1013 printf("shared ");
1014 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001015
Victor Stinnera3b334d2011-10-03 13:53:37 +02001016 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 printf(" (%zu), ", compact->wstr_length);
1018 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1019 printf("shared ");
1020 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001021 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001022 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001023}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001024#endif
1025
1026PyObject *
1027PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1028{
1029 PyObject *obj;
1030 PyCompactUnicodeObject *unicode;
1031 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001032 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001033 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 Py_ssize_t char_size;
1035 Py_ssize_t struct_size;
1036
1037 /* Optimization for empty strings */
1038 if (size == 0 && unicode_empty != NULL) {
1039 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001040 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 }
1042
Victor Stinner9e9d6892011-10-04 01:02:02 +02001043 is_ascii = 0;
1044 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 struct_size = sizeof(PyCompactUnicodeObject);
1046 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001047 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 char_size = 1;
1049 is_ascii = 1;
1050 struct_size = sizeof(PyASCIIObject);
1051 }
1052 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001053 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 char_size = 1;
1055 }
1056 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 2;
1059 if (sizeof(wchar_t) == 2)
1060 is_sharing = 1;
1061 }
1062 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001063 if (maxchar > MAX_UNICODE) {
1064 PyErr_SetString(PyExc_SystemError,
1065 "invalid maximum character passed to PyUnicode_New");
1066 return NULL;
1067 }
Victor Stinner8f825062012-04-27 13:55:39 +02001068 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 char_size = 4;
1070 if (sizeof(wchar_t) == 4)
1071 is_sharing = 1;
1072 }
1073
1074 /* Ensure we won't overflow the size. */
1075 if (size < 0) {
1076 PyErr_SetString(PyExc_SystemError,
1077 "Negative size passed to PyUnicode_New");
1078 return NULL;
1079 }
1080 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1081 return PyErr_NoMemory();
1082
1083 /* Duplicated allocation code from _PyObject_New() instead of a call to
1084 * PyObject_New() so we are able to allocate space for the object and
1085 * it's data buffer.
1086 */
1087 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1088 if (obj == NULL)
1089 return PyErr_NoMemory();
1090 obj = PyObject_INIT(obj, &PyUnicode_Type);
1091 if (obj == NULL)
1092 return NULL;
1093
1094 unicode = (PyCompactUnicodeObject *)obj;
1095 if (is_ascii)
1096 data = ((PyASCIIObject*)obj) + 1;
1097 else
1098 data = unicode + 1;
1099 _PyUnicode_LENGTH(unicode) = size;
1100 _PyUnicode_HASH(unicode) = -1;
1101 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001102 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 _PyUnicode_STATE(unicode).compact = 1;
1104 _PyUnicode_STATE(unicode).ready = 1;
1105 _PyUnicode_STATE(unicode).ascii = is_ascii;
1106 if (is_ascii) {
1107 ((char*)data)[size] = 0;
1108 _PyUnicode_WSTR(unicode) = NULL;
1109 }
Victor Stinner8f825062012-04-27 13:55:39 +02001110 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001115 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 else {
1118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001122 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 ((Py_UCS4*)data)[size] = 0;
1124 if (is_sharing) {
1125 _PyUnicode_WSTR_LENGTH(unicode) = size;
1126 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1127 }
1128 else {
1129 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1130 _PyUnicode_WSTR(unicode) = NULL;
1131 }
1132 }
Victor Stinner8f825062012-04-27 13:55:39 +02001133#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001134 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001135#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001136 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 return obj;
1138}
1139
1140#if SIZEOF_WCHAR_T == 2
1141/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1142 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001143 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144
1145 This function assumes that unicode can hold one more code point than wstr
1146 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001147static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001149 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150{
1151 const wchar_t *iter;
1152 Py_UCS4 *ucs4_out;
1153
Victor Stinner910337b2011-10-03 03:20:16 +02001154 assert(unicode != NULL);
1155 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1157 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1158
1159 for (iter = begin; iter < end; ) {
1160 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1161 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001162 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1163 && (iter+1) < end
1164 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 {
Victor Stinner551ac952011-11-29 22:58:13 +01001166 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 iter += 2;
1168 }
1169 else {
1170 *ucs4_out++ = *iter;
1171 iter++;
1172 }
1173 }
1174 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1175 _PyUnicode_GET_LENGTH(unicode)));
1176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001177}
1178#endif
1179
Victor Stinnercd9950f2011-10-02 00:34:53 +02001180static int
Victor Stinner488fa492011-12-12 00:01:39 +01001181unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001182{
Victor Stinner488fa492011-12-12 00:01:39 +01001183 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001184 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001185 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186 return -1;
1187 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001188 return 0;
1189}
1190
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001191static int
1192_copy_characters(PyObject *to, Py_ssize_t to_start,
1193 PyObject *from, Py_ssize_t from_start,
1194 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001195{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001196 unsigned int from_kind, to_kind;
1197 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198
Victor Stinneree4544c2012-05-09 22:24:08 +02001199 assert(0 <= how_many);
1200 assert(0 <= from_start);
1201 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001203 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001204 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205
Victor Stinnerd3f08822012-05-29 12:57:52 +02001206 assert(PyUnicode_Check(to));
1207 assert(PyUnicode_IS_READY(to));
1208 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1209
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001210 if (how_many == 0)
1211 return 0;
1212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001214 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001215 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217
Victor Stinnerf1852262012-06-16 16:38:26 +02001218#ifdef Py_DEBUG
1219 if (!check_maxchar
1220 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1221 {
1222 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1223 Py_UCS4 ch;
1224 Py_ssize_t i;
1225 for (i=0; i < how_many; i++) {
1226 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1227 assert(ch <= to_maxchar);
1228 }
1229 }
1230#endif
1231
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001233 if (check_maxchar
1234 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1235 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 /* Writing Latin-1 characters into an ASCII string requires to
1237 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001238 Py_UCS4 max_char;
1239 max_char = ucs1lib_find_max_char(from_data,
1240 (Py_UCS1*)from_data + how_many);
1241 if (max_char >= 128)
1242 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001243 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001244 Py_MEMCPY((char*)to_data + to_kind * to_start,
1245 (char*)from_data + from_kind * from_start,
1246 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001248 else if (from_kind == PyUnicode_1BYTE_KIND
1249 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001250 {
1251 _PyUnicode_CONVERT_BYTES(
1252 Py_UCS1, Py_UCS2,
1253 PyUnicode_1BYTE_DATA(from) + from_start,
1254 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1255 PyUnicode_2BYTE_DATA(to) + to_start
1256 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001257 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001258 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001259 && to_kind == PyUnicode_4BYTE_KIND)
1260 {
1261 _PyUnicode_CONVERT_BYTES(
1262 Py_UCS1, Py_UCS4,
1263 PyUnicode_1BYTE_DATA(from) + from_start,
1264 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1265 PyUnicode_4BYTE_DATA(to) + to_start
1266 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001267 }
1268 else if (from_kind == PyUnicode_2BYTE_KIND
1269 && to_kind == PyUnicode_4BYTE_KIND)
1270 {
1271 _PyUnicode_CONVERT_BYTES(
1272 Py_UCS2, Py_UCS4,
1273 PyUnicode_2BYTE_DATA(from) + from_start,
1274 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1275 PyUnicode_4BYTE_DATA(to) + to_start
1276 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001277 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001278 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001279 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1280
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001281 if (!check_maxchar) {
1282 if (from_kind == PyUnicode_2BYTE_KIND
1283 && to_kind == PyUnicode_1BYTE_KIND)
1284 {
1285 _PyUnicode_CONVERT_BYTES(
1286 Py_UCS2, Py_UCS1,
1287 PyUnicode_2BYTE_DATA(from) + from_start,
1288 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1289 PyUnicode_1BYTE_DATA(to) + to_start
1290 );
1291 }
1292 else if (from_kind == PyUnicode_4BYTE_KIND
1293 && to_kind == PyUnicode_1BYTE_KIND)
1294 {
1295 _PyUnicode_CONVERT_BYTES(
1296 Py_UCS4, Py_UCS1,
1297 PyUnicode_4BYTE_DATA(from) + from_start,
1298 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1299 PyUnicode_1BYTE_DATA(to) + to_start
1300 );
1301 }
1302 else if (from_kind == PyUnicode_4BYTE_KIND
1303 && to_kind == PyUnicode_2BYTE_KIND)
1304 {
1305 _PyUnicode_CONVERT_BYTES(
1306 Py_UCS4, Py_UCS2,
1307 PyUnicode_4BYTE_DATA(from) + from_start,
1308 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1309 PyUnicode_2BYTE_DATA(to) + to_start
1310 );
1311 }
1312 else {
1313 assert(0);
1314 return -1;
1315 }
1316 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001317 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001318 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001319 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001320 Py_ssize_t i;
1321
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 for (i=0; i < how_many; i++) {
1323 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001324 if (ch > to_maxchar)
1325 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1327 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001328 }
1329 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 return 0;
1331}
1332
Victor Stinnerd3f08822012-05-29 12:57:52 +02001333void
1334_PyUnicode_FastCopyCharacters(
1335 PyObject *to, Py_ssize_t to_start,
1336 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001337{
1338 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1339}
1340
1341Py_ssize_t
1342PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1343 PyObject *from, Py_ssize_t from_start,
1344 Py_ssize_t how_many)
1345{
1346 int err;
1347
1348 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1349 PyErr_BadInternalCall();
1350 return -1;
1351 }
1352
Benjamin Petersonbac79492012-01-14 13:34:47 -05001353 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001354 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001355 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001356 return -1;
1357
Victor Stinnerd3f08822012-05-29 12:57:52 +02001358 if (from_start < 0) {
1359 PyErr_SetString(PyExc_IndexError, "string index out of range");
1360 return -1;
1361 }
1362 if (to_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001366 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1367 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1368 PyErr_Format(PyExc_SystemError,
1369 "Cannot write %zi characters at %zi "
1370 "in a string of %zi characters",
1371 how_many, to_start, PyUnicode_GET_LENGTH(to));
1372 return -1;
1373 }
1374
1375 if (how_many == 0)
1376 return 0;
1377
Victor Stinner488fa492011-12-12 00:01:39 +01001378 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001379 return -1;
1380
1381 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1382 if (err) {
1383 PyErr_Format(PyExc_SystemError,
1384 "Cannot copy %s characters "
1385 "into a string of %s characters",
1386 unicode_kind_name(from),
1387 unicode_kind_name(to));
1388 return -1;
1389 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391}
1392
Victor Stinner17222162011-09-28 22:15:37 +02001393/* Find the maximum code point and count the number of surrogate pairs so a
1394 correct string length can be computed before converting a string to UCS4.
1395 This function counts single surrogates as a character and not as a pair.
1396
1397 Return 0 on success, or -1 on error. */
1398static int
1399find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1400 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401{
1402 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001403 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404
Victor Stinnerc53be962011-10-02 21:33:54 +02001405 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 *num_surrogates = 0;
1407 *maxchar = 0;
1408
1409 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001411 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1412 && (iter+1) < end
1413 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1414 {
1415 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1416 ++(*num_surrogates);
1417 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 }
1419 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001421 {
1422 ch = *iter;
1423 iter++;
1424 }
1425 if (ch > *maxchar) {
1426 *maxchar = ch;
1427 if (*maxchar > MAX_UNICODE) {
1428 PyErr_Format(PyExc_ValueError,
1429 "character U+%x is not in range [U+0000; U+10ffff]",
1430 ch);
1431 return -1;
1432 }
1433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
1435 return 0;
1436}
1437
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001438int
1439_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440{
1441 wchar_t *end;
1442 Py_UCS4 maxchar = 0;
1443 Py_ssize_t num_surrogates;
1444#if SIZEOF_WCHAR_T == 2
1445 Py_ssize_t length_wo_surrogates;
1446#endif
1447
Georg Brandl7597add2011-10-05 16:36:47 +02001448 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001449 strings were created using _PyObject_New() and where no canonical
1450 representation (the str field) has been set yet aka strings
1451 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001452 assert(_PyUnicode_CHECK(unicode));
1453 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001455 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001457 /* Actually, it should neither be interned nor be anything else: */
1458 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001461 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001462 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464
1465 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1467 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 PyErr_NoMemory();
1469 return -1;
1470 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001471 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 _PyUnicode_WSTR(unicode), end,
1473 PyUnicode_1BYTE_DATA(unicode));
1474 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1475 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1476 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1477 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001478 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001479 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001480 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
1482 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001483 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8(unicode) = NULL;
1485 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 }
1487 PyObject_FREE(_PyUnicode_WSTR(unicode));
1488 _PyUnicode_WSTR(unicode) = NULL;
1489 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1490 }
1491 /* In this case we might have to convert down from 4-byte native
1492 wchar_t to 2-byte unicode. */
1493 else if (maxchar < 65536) {
1494 assert(num_surrogates == 0 &&
1495 "FindMaxCharAndNumSurrogatePairs() messed up");
1496
Victor Stinner506f5922011-09-28 22:34:18 +02001497#if SIZEOF_WCHAR_T == 2
1498 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001499 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001500 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1501 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1502 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001503 _PyUnicode_UTF8(unicode) = NULL;
1504 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001505#else
1506 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001507 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001508 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001509 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001510 PyErr_NoMemory();
1511 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 }
Victor Stinner506f5922011-09-28 22:34:18 +02001513 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1514 _PyUnicode_WSTR(unicode), end,
1515 PyUnicode_2BYTE_DATA(unicode));
1516 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1517 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1518 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001519 _PyUnicode_UTF8(unicode) = NULL;
1520 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001521 PyObject_FREE(_PyUnicode_WSTR(unicode));
1522 _PyUnicode_WSTR(unicode) = NULL;
1523 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1524#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525 }
1526 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1527 else {
1528#if SIZEOF_WCHAR_T == 2
1529 /* in case the native representation is 2-bytes, we need to allocate a
1530 new normalized 4-byte version. */
1531 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001532 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1533 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 PyErr_NoMemory();
1535 return -1;
1536 }
1537 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1538 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001539 _PyUnicode_UTF8(unicode) = NULL;
1540 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001541 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1542 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001543 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544 PyObject_FREE(_PyUnicode_WSTR(unicode));
1545 _PyUnicode_WSTR(unicode) = NULL;
1546 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1547#else
1548 assert(num_surrogates == 0);
1549
Victor Stinnerc3c74152011-10-02 20:39:55 +02001550 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001552 _PyUnicode_UTF8(unicode) = NULL;
1553 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1555#endif
1556 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1557 }
1558 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001559 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560 return 0;
1561}
1562
Alexander Belopolsky40018472011-02-26 01:02:56 +00001563static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001564unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565{
Walter Dörwald16807132007-05-25 13:52:07 +00001566 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 case SSTATE_NOT_INTERNED:
1568 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001569
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 case SSTATE_INTERNED_MORTAL:
1571 /* revive dead object temporarily for DelItem */
1572 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001573 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 Py_FatalError(
1575 "deletion of interned string failed");
1576 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001577
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 case SSTATE_INTERNED_IMMORTAL:
1579 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001580
Benjamin Peterson29060642009-01-31 22:14:21 +00001581 default:
1582 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001583 }
1584
Victor Stinner03490912011-10-03 23:45:12 +02001585 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001587 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001588 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001589 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1590 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001592 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593}
1594
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects
   visible here -- the empty string (unicode_empty) or the cached
   one-character string stored in unicode_latin1[] -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical kind can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1611
Alexander Belopolsky40018472011-02-26 01:02:56 +00001612static int
Victor Stinner488fa492011-12-12 00:01:39 +01001613unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001614{
Victor Stinner488fa492011-12-12 00:01:39 +01001615 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001616 if (Py_REFCNT(unicode) != 1)
1617 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001618 if (_PyUnicode_HASH(unicode) != -1)
1619 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_CHECK_INTERNED(unicode))
1621 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001622 if (!PyUnicode_CheckExact(unicode))
1623 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001624#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001625 /* singleton refcount is greater than 1 */
1626 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001627#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 return 1;
1629}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001630
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631static int
1632unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1633{
1634 PyObject *unicode;
1635 Py_ssize_t old_length;
1636
1637 assert(p_unicode != NULL);
1638 unicode = *p_unicode;
1639
1640 assert(unicode != NULL);
1641 assert(PyUnicode_Check(unicode));
1642 assert(0 <= length);
1643
Victor Stinner910337b2011-10-03 03:20:16 +02001644 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001645 old_length = PyUnicode_WSTR_LENGTH(unicode);
1646 else
1647 old_length = PyUnicode_GET_LENGTH(unicode);
1648 if (old_length == length)
1649 return 0;
1650
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001651 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001652 _Py_INCREF_UNICODE_EMPTY();
1653 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001654 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 Py_DECREF(*p_unicode);
1656 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001657 return 0;
1658 }
1659
Victor Stinner488fa492011-12-12 00:01:39 +01001660 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001661 PyObject *copy = resize_copy(unicode, length);
1662 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001663 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001664 Py_DECREF(*p_unicode);
1665 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001666 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001667 }
1668
Victor Stinnerfe226c02011-10-03 03:52:20 +02001669 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001670 PyObject *new_unicode = resize_compact(unicode, length);
1671 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001672 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001673 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001674 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001675 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001676 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001677}
1678
Alexander Belopolsky40018472011-02-26 01:02:56 +00001679int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001681{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001682 PyObject *unicode;
1683 if (p_unicode == NULL) {
1684 PyErr_BadInternalCall();
1685 return -1;
1686 }
1687 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001688 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001689 {
1690 PyErr_BadInternalCall();
1691 return -1;
1692 }
1693 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001694}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001695
Victor Stinnerc5166102012-02-22 13:55:02 +01001696/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001697
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001698 WARNING: The function doesn't copy the terminating null character and
1699 doesn't check the maximum character (may write a latin1 character in an
1700 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001701static void
1702unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1703 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001704{
1705 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1706 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001707 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001708
1709 switch (kind) {
1710 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001711 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001712#ifdef Py_DEBUG
1713 if (PyUnicode_IS_ASCII(unicode)) {
1714 Py_UCS4 maxchar = ucs1lib_find_max_char(
1715 (const Py_UCS1*)str,
1716 (const Py_UCS1*)str + len);
1717 assert(maxchar < 128);
1718 }
1719#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001720 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001721 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001722 }
1723 case PyUnicode_2BYTE_KIND: {
1724 Py_UCS2 *start = (Py_UCS2 *)data + index;
1725 Py_UCS2 *ucs2 = start;
1726 assert(index <= PyUnicode_GET_LENGTH(unicode));
1727
Victor Stinner184252a2012-06-16 02:57:41 +02001728 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001729 *ucs2 = (Py_UCS2)*str;
1730
1731 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001732 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 }
1734 default: {
1735 Py_UCS4 *start = (Py_UCS4 *)data + index;
1736 Py_UCS4 *ucs4 = start;
1737 assert(kind == PyUnicode_4BYTE_KIND);
1738 assert(index <= PyUnicode_GET_LENGTH(unicode));
1739
Victor Stinner184252a2012-06-16 02:57:41 +02001740 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001741 *ucs4 = (Py_UCS4)*str;
1742
1743 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001744 }
1745 }
1746}
1747
1748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749static PyObject*
1750get_latin1_char(unsigned char ch)
1751{
Victor Stinnera464fc12011-10-02 20:39:30 +02001752 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001754 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 if (!unicode)
1756 return NULL;
1757 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001758 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 unicode_latin1[ch] = unicode;
1760 }
1761 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001762 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765PyObject *
1766PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001768 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 Py_UCS4 maxchar = 0;
1770 Py_ssize_t num_surrogates;
1771
1772 if (u == NULL)
1773 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001775 /* If the Unicode data is known at construction time, we can apply
1776 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001779 if (size == 0)
1780 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 /* Single character Unicode objects in the Latin-1 range are
1783 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001784 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 return get_latin1_char((unsigned char)*u);
1786
1787 /* If not empty and not single character, copy the Unicode data
1788 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001789 if (find_maxchar_surrogates(u, u + size,
1790 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 return NULL;
1792
Victor Stinner8faf8212011-12-08 22:14:11 +01001793 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 if (!unicode)
1795 return NULL;
1796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 switch (PyUnicode_KIND(unicode)) {
1798 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001799 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1801 break;
1802 case PyUnicode_2BYTE_KIND:
1803#if Py_UNICODE_SIZE == 2
1804 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1805#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001806 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1808#endif
1809 break;
1810 case PyUnicode_4BYTE_KIND:
1811#if SIZEOF_WCHAR_T == 2
1812 /* This is the only case which has to process surrogates, thus
1813 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001814 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815#else
1816 assert(num_surrogates == 0);
1817 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1818#endif
1819 break;
1820 default:
1821 assert(0 && "Impossible state");
1822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001824 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825}
1826
Alexander Belopolsky40018472011-02-26 01:02:56 +00001827PyObject *
1828PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001829{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001830 if (size < 0) {
1831 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001832 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001833 return NULL;
1834 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001835 if (u != NULL)
1836 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1837 else
1838 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001839}
1840
Alexander Belopolsky40018472011-02-26 01:02:56 +00001841PyObject *
1842PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001843{
1844 size_t size = strlen(u);
1845 if (size > PY_SSIZE_T_MAX) {
1846 PyErr_SetString(PyExc_OverflowError, "input too long");
1847 return NULL;
1848 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001849 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001850}
1851
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001852PyObject *
1853_PyUnicode_FromId(_Py_Identifier *id)
1854{
1855 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001856 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1857 strlen(id->string),
1858 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001859 if (!id->object)
1860 return NULL;
1861 PyUnicode_InternInPlace(&id->object);
1862 assert(!id->next);
1863 id->next = static_strings;
1864 static_strings = id;
1865 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001866 return id->object;
1867}
1868
1869void
1870_PyUnicode_ClearStaticStrings()
1871{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001872 _Py_Identifier *tmp, *s = static_strings;
1873 while (s) {
1874 Py_DECREF(s->object);
1875 s->object = NULL;
1876 tmp = s->next;
1877 s->next = NULL;
1878 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001879 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001880 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001881}
1882
Benjamin Peterson0df54292012-03-26 14:50:32 -04001883/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001884
Victor Stinnerd3f08822012-05-29 12:57:52 +02001885PyObject*
1886_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001887{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001888 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001889 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001890 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001891#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001892 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001893#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001894 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001895 }
Victor Stinner785938e2011-12-11 20:09:03 +01001896 unicode = PyUnicode_New(size, 127);
1897 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001898 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001899 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1900 assert(_PyUnicode_CheckConsistency(unicode, 1));
1901 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001902}
1903
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001904static Py_UCS4
1905kind_maxchar_limit(unsigned int kind)
1906{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001907 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001908 case PyUnicode_1BYTE_KIND:
1909 return 0x80;
1910 case PyUnicode_2BYTE_KIND:
1911 return 0x100;
1912 case PyUnicode_4BYTE_KIND:
1913 return 0x10000;
1914 default:
1915 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001916 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001917 }
1918}
1919
Victor Stinnere6abb482012-05-02 01:15:40 +02001920Py_LOCAL_INLINE(Py_UCS4)
1921align_maxchar(Py_UCS4 maxchar)
1922{
1923 if (maxchar <= 127)
1924 return 127;
1925 else if (maxchar <= 255)
1926 return 255;
1927 else if (maxchar <= 65535)
1928 return 65535;
1929 else
1930 return MAX_UNICODE;
1931}
1932
Victor Stinner702c7342011-10-05 13:50:52 +02001933static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001934_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001935{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001937 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938
Serhiy Storchaka678db842013-01-26 12:16:36 +02001939 if (size == 0)
1940 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001941 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001942 if (size == 1)
1943 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001944
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001945 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001946 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 if (!res)
1948 return NULL;
1949 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001950 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001951 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001952}
1953
Victor Stinnere57b1c02011-09-28 22:20:48 +02001954static PyObject*
1955_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956{
1957 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001958 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001959
Serhiy Storchaka678db842013-01-26 12:16:36 +02001960 if (size == 0)
1961 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001962 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001963 if (size == 1) {
1964 Py_UCS4 ch = u[0];
1965 if (ch < 256)
1966 return get_latin1_char((unsigned char)ch);
1967
1968 res = PyUnicode_New(1, ch);
1969 if (res == NULL)
1970 return NULL;
1971 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1972 assert(_PyUnicode_CheckConsistency(res, 1));
1973 return res;
1974 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001977 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (!res)
1979 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001982 else {
1983 _PyUnicode_CONVERT_BYTES(
1984 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1985 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001986 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 return res;
1988}
1989
Victor Stinnere57b1c02011-09-28 22:20:48 +02001990static PyObject*
1991_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992{
1993 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001994 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001995
Serhiy Storchaka678db842013-01-26 12:16:36 +02001996 if (size == 0)
1997 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001998 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001999 if (size == 1) {
2000 Py_UCS4 ch = u[0];
2001 if (ch < 256)
2002 return get_latin1_char((unsigned char)ch);
2003
2004 res = PyUnicode_New(1, ch);
2005 if (res == NULL)
2006 return NULL;
2007 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
2008 assert(_PyUnicode_CheckConsistency(res, 1));
2009 return res;
2010 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002011
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002012 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002013 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 if (!res)
2015 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002016 if (max_char < 256)
2017 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2018 PyUnicode_1BYTE_DATA(res));
2019 else if (max_char < 0x10000)
2020 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2021 PyUnicode_2BYTE_DATA(res));
2022 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002024 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 return res;
2026}
2027
2028PyObject*
2029PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2030{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002031 if (size < 0) {
2032 PyErr_SetString(PyExc_ValueError, "size must be positive");
2033 return NULL;
2034 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002035 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002037 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002039 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002041 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002042 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002043 PyErr_SetString(PyExc_SystemError, "invalid kind");
2044 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046}
2047
Victor Stinnerece58de2012-04-23 23:36:38 +02002048Py_UCS4
2049_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2050{
2051 enum PyUnicode_Kind kind;
2052 void *startptr, *endptr;
2053
2054 assert(PyUnicode_IS_READY(unicode));
2055 assert(0 <= start);
2056 assert(end <= PyUnicode_GET_LENGTH(unicode));
2057 assert(start <= end);
2058
2059 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2060 return PyUnicode_MAX_CHAR_VALUE(unicode);
2061
2062 if (start == end)
2063 return 127;
2064
Victor Stinner94d558b2012-04-27 22:26:58 +02002065 if (PyUnicode_IS_ASCII(unicode))
2066 return 127;
2067
Victor Stinnerece58de2012-04-23 23:36:38 +02002068 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002069 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002070 endptr = (char *)startptr + end * kind;
2071 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002072 switch(kind) {
2073 case PyUnicode_1BYTE_KIND:
2074 return ucs1lib_find_max_char(startptr, endptr);
2075 case PyUnicode_2BYTE_KIND:
2076 return ucs2lib_find_max_char(startptr, endptr);
2077 case PyUnicode_4BYTE_KIND:
2078 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002079 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002080 assert(0);
2081 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002082 }
2083}
2084
Victor Stinner25a4b292011-10-06 12:31:55 +02002085/* Ensure that a string uses the most efficient storage, if it is not the
2086 case: create a new string with of the right kind. Write NULL into *p_unicode
2087 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002088static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002089unicode_adjust_maxchar(PyObject **p_unicode)
2090{
2091 PyObject *unicode, *copy;
2092 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002093 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002094 unsigned int kind;
2095
2096 assert(p_unicode != NULL);
2097 unicode = *p_unicode;
2098 assert(PyUnicode_IS_READY(unicode));
2099 if (PyUnicode_IS_ASCII(unicode))
2100 return;
2101
2102 len = PyUnicode_GET_LENGTH(unicode);
2103 kind = PyUnicode_KIND(unicode);
2104 if (kind == PyUnicode_1BYTE_KIND) {
2105 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002106 max_char = ucs1lib_find_max_char(u, u + len);
2107 if (max_char >= 128)
2108 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002109 }
2110 else if (kind == PyUnicode_2BYTE_KIND) {
2111 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002112 max_char = ucs2lib_find_max_char(u, u + len);
2113 if (max_char >= 256)
2114 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002115 }
2116 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002117 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002118 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002119 max_char = ucs4lib_find_max_char(u, u + len);
2120 if (max_char >= 0x10000)
2121 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002123 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002124 if (copy != NULL)
2125 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002126 Py_DECREF(unicode);
2127 *p_unicode = copy;
2128}
2129
Victor Stinner034f6cf2011-09-30 02:26:44 +02002130PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002131_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002132{
Victor Stinner87af4f22011-11-21 23:03:47 +01002133 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002134 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002135
Victor Stinner034f6cf2011-09-30 02:26:44 +02002136 if (!PyUnicode_Check(unicode)) {
2137 PyErr_BadInternalCall();
2138 return NULL;
2139 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002140 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002141 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002142
Victor Stinner87af4f22011-11-21 23:03:47 +01002143 length = PyUnicode_GET_LENGTH(unicode);
2144 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145 if (!copy)
2146 return NULL;
2147 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2148
Victor Stinner87af4f22011-11-21 23:03:47 +01002149 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2150 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002151 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153}
2154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155
Victor Stinnerbc603d12011-10-02 01:00:40 +02002156/* Widen Unicode objects to larger buffers. Don't write terminating null
2157 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158
2159void*
2160_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2161{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002162 Py_ssize_t len;
2163 void *result;
2164 unsigned int skind;
2165
Benjamin Petersonbac79492012-01-14 13:34:47 -05002166 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167 return NULL;
2168
2169 len = PyUnicode_GET_LENGTH(s);
2170 skind = PyUnicode_KIND(s);
2171 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002172 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return NULL;
2174 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002175 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002176 case PyUnicode_2BYTE_KIND:
2177 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2178 if (!result)
2179 return PyErr_NoMemory();
2180 assert(skind == PyUnicode_1BYTE_KIND);
2181 _PyUnicode_CONVERT_BYTES(
2182 Py_UCS1, Py_UCS2,
2183 PyUnicode_1BYTE_DATA(s),
2184 PyUnicode_1BYTE_DATA(s) + len,
2185 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002187 case PyUnicode_4BYTE_KIND:
2188 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2189 if (!result)
2190 return PyErr_NoMemory();
2191 if (skind == PyUnicode_2BYTE_KIND) {
2192 _PyUnicode_CONVERT_BYTES(
2193 Py_UCS2, Py_UCS4,
2194 PyUnicode_2BYTE_DATA(s),
2195 PyUnicode_2BYTE_DATA(s) + len,
2196 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002198 else {
2199 assert(skind == PyUnicode_1BYTE_KIND);
2200 _PyUnicode_CONVERT_BYTES(
2201 Py_UCS1, Py_UCS4,
2202 PyUnicode_1BYTE_DATA(s),
2203 PyUnicode_1BYTE_DATA(s) + len,
2204 result);
2205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002207 default:
2208 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 }
Victor Stinner01698042011-10-04 00:04:26 +02002210 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 return NULL;
2212}
2213
2214static Py_UCS4*
2215as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2216 int copy_null)
2217{
2218 int kind;
2219 void *data;
2220 Py_ssize_t len, targetlen;
2221 if (PyUnicode_READY(string) == -1)
2222 return NULL;
2223 kind = PyUnicode_KIND(string);
2224 data = PyUnicode_DATA(string);
2225 len = PyUnicode_GET_LENGTH(string);
2226 targetlen = len;
2227 if (copy_null)
2228 targetlen++;
2229 if (!target) {
2230 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2231 PyErr_NoMemory();
2232 return NULL;
2233 }
2234 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2235 if (!target) {
2236 PyErr_NoMemory();
2237 return NULL;
2238 }
2239 }
2240 else {
2241 if (targetsize < targetlen) {
2242 PyErr_Format(PyExc_SystemError,
2243 "string is longer than the buffer");
2244 if (copy_null && 0 < targetsize)
2245 target[0] = 0;
2246 return NULL;
2247 }
2248 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002249 if (kind == PyUnicode_1BYTE_KIND) {
2250 Py_UCS1 *start = (Py_UCS1 *) data;
2251 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002253 else if (kind == PyUnicode_2BYTE_KIND) {
2254 Py_UCS2 *start = (Py_UCS2 *) data;
2255 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2256 }
2257 else {
2258 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 if (copy_null)
2262 target[len] = 0;
2263 return target;
2264}
2265
2266Py_UCS4*
2267PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2268 int copy_null)
2269{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002270 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 PyErr_BadInternalCall();
2272 return NULL;
2273 }
2274 return as_ucs4(string, target, targetsize, copy_null);
2275}
2276
2277Py_UCS4*
2278PyUnicode_AsUCS4Copy(PyObject *string)
2279{
2280 return as_ucs4(string, NULL, 0, 1);
2281}
2282
2283#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002284
Alexander Belopolsky40018472011-02-26 01:02:56 +00002285PyObject *
2286PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002290 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002291 PyErr_BadInternalCall();
2292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 }
2294
Martin v. Löwis790465f2008-04-05 20:41:37 +00002295 if (size == -1) {
2296 size = wcslen(w);
2297 }
2298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002299 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300}
2301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002303
Walter Dörwald346737f2007-05-31 10:44:43 +00002304static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002305makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002306 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002307{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002308 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002309 if (longflag)
2310 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002311 else if (longlongflag) {
2312 /* longlongflag should only ever be nonzero on machines with
2313 HAVE_LONG_LONG defined */
2314#ifdef HAVE_LONG_LONG
2315 char *f = PY_FORMAT_LONG_LONG;
2316 while (*f)
2317 *fmt++ = *f++;
2318#else
2319 /* we shouldn't ever get here */
2320 assert(0);
2321 *fmt++ = 'l';
2322#endif
2323 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002324 else if (size_tflag) {
2325 char *f = PY_FORMAT_SIZE_T;
2326 while (*f)
2327 *fmt++ = *f++;
2328 }
2329 *fmt++ = c;
2330 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002331}
2332
Victor Stinner15a11362012-10-06 23:48:20 +02002333/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002334 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2335 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2336#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002337
2338static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002339unicode_fromformat_arg(_PyUnicodeWriter *writer,
2340 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002341{
Victor Stinnere215d962012-10-06 23:03:36 +02002342 const char *p;
2343 Py_ssize_t len;
2344 int zeropad;
2345 int width;
2346 int precision;
2347 int longflag;
2348 int longlongflag;
2349 int size_tflag;
2350 int fill;
2351
2352 p = f;
2353 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002354 zeropad = 0;
2355 if (*f == '0') {
2356 zeropad = 1;
2357 f++;
2358 }
Victor Stinner96865452011-03-01 23:44:09 +00002359
2360 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002361 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002362 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002363 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2364 PyErr_SetString(PyExc_ValueError,
2365 "width too big");
2366 return NULL;
2367 }
Victor Stinnere215d962012-10-06 23:03:36 +02002368 width = (width*10) + (*f - '0');
2369 f++;
2370 }
Victor Stinner96865452011-03-01 23:44:09 +00002371 precision = 0;
2372 if (*f == '.') {
2373 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002374 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002375 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2376 PyErr_SetString(PyExc_ValueError,
2377 "precision too big");
2378 return NULL;
2379 }
Victor Stinnere215d962012-10-06 23:03:36 +02002380 precision = (precision*10) + (*f - '0');
2381 f++;
2382 }
Victor Stinner96865452011-03-01 23:44:09 +00002383 if (*f == '%') {
2384 /* "%.3%s" => f points to "3" */
2385 f--;
2386 }
2387 }
2388 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002389 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002390 f--;
2391 }
Victor Stinner96865452011-03-01 23:44:09 +00002392
2393 /* Handle %ld, %lu, %lld and %llu. */
2394 longflag = 0;
2395 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002396 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002397 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002398 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002399 longflag = 1;
2400 ++f;
2401 }
2402#ifdef HAVE_LONG_LONG
2403 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002404 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002405 longlongflag = 1;
2406 f += 2;
2407 }
2408#endif
2409 }
2410 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002411 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002412 size_tflag = 1;
2413 ++f;
2414 }
Victor Stinnere215d962012-10-06 23:03:36 +02002415
2416 if (f[1] == '\0')
2417 writer->overallocate = 0;
2418
2419 switch (*f) {
2420 case 'c':
2421 {
2422 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002423 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2424 PyErr_SetString(PyExc_ValueError,
2425 "character argument not in range(0x110000)");
2426 return NULL;
2427 }
Victor Stinnere215d962012-10-06 23:03:36 +02002428 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2429 return NULL;
2430 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2431 writer->pos++;
2432 break;
2433 }
2434
2435 case 'i':
2436 case 'd':
2437 case 'u':
2438 case 'x':
2439 {
2440 /* used by sprintf */
2441 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002442 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002443
2444 if (*f == 'u') {
2445 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2446
2447 if (longflag)
2448 len = sprintf(buffer, fmt,
2449 va_arg(*vargs, unsigned long));
2450#ifdef HAVE_LONG_LONG
2451 else if (longlongflag)
2452 len = sprintf(buffer, fmt,
2453 va_arg(*vargs, unsigned PY_LONG_LONG));
2454#endif
2455 else if (size_tflag)
2456 len = sprintf(buffer, fmt,
2457 va_arg(*vargs, size_t));
2458 else
2459 len = sprintf(buffer, fmt,
2460 va_arg(*vargs, unsigned int));
2461 }
2462 else if (*f == 'x') {
2463 makefmt(fmt, 0, 0, 0, 'x');
2464 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2465 }
2466 else {
2467 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2468
2469 if (longflag)
2470 len = sprintf(buffer, fmt,
2471 va_arg(*vargs, long));
2472#ifdef HAVE_LONG_LONG
2473 else if (longlongflag)
2474 len = sprintf(buffer, fmt,
2475 va_arg(*vargs, PY_LONG_LONG));
2476#endif
2477 else if (size_tflag)
2478 len = sprintf(buffer, fmt,
2479 va_arg(*vargs, Py_ssize_t));
2480 else
2481 len = sprintf(buffer, fmt,
2482 va_arg(*vargs, int));
2483 }
2484 assert(len >= 0);
2485
Victor Stinnere215d962012-10-06 23:03:36 +02002486 if (precision < len)
2487 precision = len;
2488 if (width > precision) {
2489 Py_UCS4 fillchar;
2490 fill = width - precision;
2491 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002492 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2493 return NULL;
2494 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2495 return NULL;
2496 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002497 }
Victor Stinner15a11362012-10-06 23:48:20 +02002498 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002499 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002500 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2501 return NULL;
2502 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2503 return NULL;
2504 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002505 }
Victor Stinner15a11362012-10-06 23:48:20 +02002506 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002507 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002508 break;
2509 }
2510
2511 case 'p':
2512 {
2513 char number[MAX_LONG_LONG_CHARS];
2514
2515 len = sprintf(number, "%p", va_arg(*vargs, void*));
2516 assert(len >= 0);
2517
2518 /* %p is ill-defined: ensure leading 0x. */
2519 if (number[1] == 'X')
2520 number[1] = 'x';
2521 else if (number[1] != 'x') {
2522 memmove(number + 2, number,
2523 strlen(number) + 1);
2524 number[0] = '0';
2525 number[1] = 'x';
2526 len += 2;
2527 }
2528
2529 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2530 return NULL;
2531 break;
2532 }
2533
2534 case 's':
2535 {
2536 /* UTF-8 */
2537 const char *s = va_arg(*vargs, const char*);
2538 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2539 if (!str)
2540 return NULL;
2541 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2542 Py_DECREF(str);
2543 return NULL;
2544 }
2545 Py_DECREF(str);
2546 break;
2547 }
2548
2549 case 'U':
2550 {
2551 PyObject *obj = va_arg(*vargs, PyObject *);
2552 assert(obj && _PyUnicode_CHECK(obj));
2553
2554 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2555 return NULL;
2556 break;
2557 }
2558
2559 case 'V':
2560 {
2561 PyObject *obj = va_arg(*vargs, PyObject *);
2562 const char *str = va_arg(*vargs, const char *);
2563 PyObject *str_obj;
2564 assert(obj || str);
2565 if (obj) {
2566 assert(_PyUnicode_CHECK(obj));
2567 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2568 return NULL;
2569 }
2570 else {
2571 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2572 if (!str_obj)
2573 return NULL;
2574 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2575 Py_DECREF(str_obj);
2576 return NULL;
2577 }
2578 Py_DECREF(str_obj);
2579 }
2580 break;
2581 }
2582
2583 case 'S':
2584 {
2585 PyObject *obj = va_arg(*vargs, PyObject *);
2586 PyObject *str;
2587 assert(obj);
2588 str = PyObject_Str(obj);
2589 if (!str)
2590 return NULL;
2591 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2592 Py_DECREF(str);
2593 return NULL;
2594 }
2595 Py_DECREF(str);
2596 break;
2597 }
2598
2599 case 'R':
2600 {
2601 PyObject *obj = va_arg(*vargs, PyObject *);
2602 PyObject *repr;
2603 assert(obj);
2604 repr = PyObject_Repr(obj);
2605 if (!repr)
2606 return NULL;
2607 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2608 Py_DECREF(repr);
2609 return NULL;
2610 }
2611 Py_DECREF(repr);
2612 break;
2613 }
2614
2615 case 'A':
2616 {
2617 PyObject *obj = va_arg(*vargs, PyObject *);
2618 PyObject *ascii;
2619 assert(obj);
2620 ascii = PyObject_ASCII(obj);
2621 if (!ascii)
2622 return NULL;
2623 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2624 Py_DECREF(ascii);
2625 return NULL;
2626 }
2627 Py_DECREF(ascii);
2628 break;
2629 }
2630
2631 case '%':
2632 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2633 return NULL;
2634 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2635 writer->pos++;
2636 break;
2637
2638 default:
2639 /* if we stumble upon an unknown formatting code, copy the rest
2640 of the format string to the output string. (we cannot just
2641 skip the code, since there's no way to know what's in the
2642 argument list) */
2643 len = strlen(p);
2644 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2645 return NULL;
2646 f = p+len;
2647 return f;
2648 }
2649
2650 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002651 return f;
2652}
2653
Walter Dörwaldd2034312007-05-18 16:29:38 +00002654PyObject *
2655PyUnicode_FromFormatV(const char *format, va_list vargs)
2656{
Victor Stinnere215d962012-10-06 23:03:36 +02002657 va_list vargs2;
2658 const char *f;
2659 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002660
Victor Stinnere215d962012-10-06 23:03:36 +02002661 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2662
2663 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2664 Copy it to be able to pass a reference to a subfunction. */
2665 Py_VA_COPY(vargs2, vargs);
2666
2667 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002669 f = unicode_fromformat_arg(&writer, f, &vargs2);
2670 if (f == NULL)
2671 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002674 const char *p;
2675 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002676
Victor Stinnere215d962012-10-06 23:03:36 +02002677 p = f;
2678 do
2679 {
2680 if ((unsigned char)*p > 127) {
2681 PyErr_Format(PyExc_ValueError,
2682 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2683 "string, got a non-ASCII byte: 0x%02x",
2684 (unsigned char)*p);
2685 return NULL;
2686 }
2687 p++;
2688 }
2689 while (*p != '\0' && *p != '%');
2690 len = p - f;
2691
2692 if (*p == '\0')
2693 writer.overallocate = 0;
2694 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2695 goto fail;
2696 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2697 writer.pos += len;
2698
2699 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002701 }
Victor Stinnere215d962012-10-06 23:03:36 +02002702 return _PyUnicodeWriter_Finish(&writer);
2703
2704 fail:
2705 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707}
2708
Walter Dörwaldd2034312007-05-18 16:29:38 +00002709PyObject *
2710PyUnicode_FromFormat(const char *format, ...)
2711{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 PyObject* ret;
2713 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002714
2715#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002720 ret = PyUnicode_FromFormatV(format, vargs);
2721 va_end(vargs);
2722 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002723}
2724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725#ifdef HAVE_WCHAR_H
2726
Victor Stinner5593d8a2010-10-02 11:11:27 +00002727/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2728 convert a Unicode object to a wide character string.
2729
Victor Stinnerd88d9832011-09-06 02:00:05 +02002730 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 character) required to convert the unicode object. Ignore size argument.
2732
Victor Stinnerd88d9832011-09-06 02:00:05 +02002733 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002735 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002737unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002738 wchar_t *w,
2739 Py_ssize_t size)
2740{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002741 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 const wchar_t *wstr;
2743
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002744 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 if (wstr == NULL)
2746 return -1;
2747
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002749 if (size > res)
2750 size = res + 1;
2751 else
2752 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002754 return res;
2755 }
2756 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002757 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002758}
2759
2760Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002761PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002762 wchar_t *w,
2763 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764{
2765 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 PyErr_BadInternalCall();
2767 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002769 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770}
2771
Victor Stinner137c34c2010-09-29 10:25:54 +00002772wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002773PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002774 Py_ssize_t *size)
2775{
2776 wchar_t* buffer;
2777 Py_ssize_t buflen;
2778
2779 if (unicode == NULL) {
2780 PyErr_BadInternalCall();
2781 return NULL;
2782 }
2783
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002784 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 if (buflen == -1)
2786 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002787 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002788 PyErr_NoMemory();
2789 return NULL;
2790 }
2791
Victor Stinner137c34c2010-09-29 10:25:54 +00002792 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2793 if (buffer == NULL) {
2794 PyErr_NoMemory();
2795 return NULL;
2796 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002797 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002798 if (buflen == -1) {
2799 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002801 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802 if (size != NULL)
2803 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002804 return buffer;
2805}
2806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808
Alexander Belopolsky40018472011-02-26 01:02:56 +00002809PyObject *
2810PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002813 void *data;
2814 int kind;
2815
Victor Stinner8faf8212011-12-08 22:14:11 +01002816 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002817 PyErr_SetString(PyExc_ValueError,
2818 "chr() arg not in range(0x110000)");
2819 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002820 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002821
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002822 if ((Py_UCS4)ordinal < 256)
2823 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 v = PyUnicode_New(1, ordinal);
2826 if (v == NULL)
2827 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002828 kind = PyUnicode_KIND(v);
2829 data = PyUnicode_DATA(v);
2830 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002831 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002833}
2834
Alexander Belopolsky40018472011-02-26 01:02:56 +00002835PyObject *
2836PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002838 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002840 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002841 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002842 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002843 Py_INCREF(obj);
2844 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002845 }
2846 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 /* For a Unicode subtype that's not a Unicode object,
2848 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002849 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002850 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002851 PyErr_Format(PyExc_TypeError,
2852 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002853 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002854 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002855}
2856
Alexander Belopolsky40018472011-02-26 01:02:56 +00002857PyObject *
2858PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002859 const char *encoding,
2860 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002861{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002863 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002864
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002866 PyErr_BadInternalCall();
2867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002869
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002870 /* Decoding bytes objects is the most common case and should be fast */
2871 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002872 if (PyBytes_GET_SIZE(obj) == 0)
2873 _Py_RETURN_UNICODE_EMPTY();
2874 v = PyUnicode_Decode(
2875 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2876 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 return v;
2878 }
2879
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002880 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 PyErr_SetString(PyExc_TypeError,
2882 "decoding str is not supported");
2883 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002884 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2887 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2888 PyErr_Format(PyExc_TypeError,
2889 "coercing to str: need bytes, bytearray "
2890 "or buffer-like object, %.80s found",
2891 Py_TYPE(obj)->tp_name);
2892 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002893 }
Tim Petersced69f82003-09-16 20:30:58 +00002894
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002896 PyBuffer_Release(&buffer);
2897 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002899
Serhiy Storchaka05997252013-01-26 12:14:02 +02002900 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002901 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002902 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903}
2904
/* Normalize an encoding name into the caller-supplied buffer `lower`:
   characters for which Py_ISUPPER() is true are passed through
   Py_TOLOWER(), and '_' is replaced with '-', in order to catch e.g.
   UTF_8.  A NULL encoding is normalized to "utf-8".

   lower_len is the size of `lower` in bytes, including the room needed for
   the terminating NUL.

   Return 1 on success.  Return 0 on error, i.e. when the normalized name
   (including a NULL encoding's "utf-8") does not fit in lower_len-1
   characters; `lower` is not guaranteed to be NUL-terminated then. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator nothing can be stored, and
       &lower[lower_len - 1] below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Respect lower_len on this path too instead of copying blindly;
           sizeof(default_name) includes the terminating NUL. */
        if (lower_len < sizeof(default_name))
            return 0;
        strcpy(lower, default_name);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot; it is reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2941
Alexander Belopolsky40018472011-02-26 01:02:56 +00002942PyObject *
2943PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002944 Py_ssize_t size,
2945 const char *encoding,
2946 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002947{
2948 PyObject *buffer = NULL, *unicode;
2949 Py_buffer info;
2950 char lower[11]; /* Enough for any encoding shortcut */
2951
Fred Drakee4315f52000-05-09 19:53:39 +00002952 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002953 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002954 if ((strcmp(lower, "utf-8") == 0) ||
2955 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002956 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002957 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002958 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002959 (strcmp(lower, "iso-8859-1") == 0))
2960 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002961#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002962 else if (strcmp(lower, "mbcs") == 0)
2963 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002964#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002965 else if (strcmp(lower, "ascii") == 0)
2966 return PyUnicode_DecodeASCII(s, size, errors);
2967 else if (strcmp(lower, "utf-16") == 0)
2968 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2969 else if (strcmp(lower, "utf-32") == 0)
2970 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972
2973 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002974 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002975 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002976 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002977 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 if (buffer == NULL)
2979 goto onError;
2980 unicode = PyCodec_Decode(buffer, encoding, errors);
2981 if (unicode == NULL)
2982 goto onError;
2983 if (!PyUnicode_Check(unicode)) {
2984 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002985 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002986 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 Py_DECREF(unicode);
2988 goto onError;
2989 }
2990 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002991 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002992
Benjamin Peterson29060642009-01-31 22:14:21 +00002993 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 Py_XDECREF(buffer);
2995 return NULL;
2996}
2997
Alexander Belopolsky40018472011-02-26 01:02:56 +00002998PyObject *
2999PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003000 const char *encoding,
3001 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003002{
3003 PyObject *v;
3004
3005 if (!PyUnicode_Check(unicode)) {
3006 PyErr_BadArgument();
3007 goto onError;
3008 }
3009
3010 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012
3013 /* Decode via the codec registry */
3014 v = PyCodec_Decode(unicode, encoding, errors);
3015 if (v == NULL)
3016 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003017 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003018
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003020 return NULL;
3021}
3022
Alexander Belopolsky40018472011-02-26 01:02:56 +00003023PyObject *
3024PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003025 const char *encoding,
3026 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003027{
3028 PyObject *v;
3029
3030 if (!PyUnicode_Check(unicode)) {
3031 PyErr_BadArgument();
3032 goto onError;
3033 }
3034
3035 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037
3038 /* Decode via the codec registry */
3039 v = PyCodec_Decode(unicode, encoding, errors);
3040 if (v == NULL)
3041 goto onError;
3042 if (!PyUnicode_Check(v)) {
3043 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003044 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003045 Py_TYPE(v)->tp_name);
3046 Py_DECREF(v);
3047 goto onError;
3048 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003049 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003050
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003052 return NULL;
3053}
3054
Alexander Belopolsky40018472011-02-26 01:02:56 +00003055PyObject *
3056PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003057 Py_ssize_t size,
3058 const char *encoding,
3059 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060{
3061 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003062
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 unicode = PyUnicode_FromUnicode(s, size);
3064 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3067 Py_DECREF(unicode);
3068 return v;
3069}
3070
Alexander Belopolsky40018472011-02-26 01:02:56 +00003071PyObject *
3072PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003073 const char *encoding,
3074 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003075{
3076 PyObject *v;
3077
3078 if (!PyUnicode_Check(unicode)) {
3079 PyErr_BadArgument();
3080 goto onError;
3081 }
3082
3083 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003085
3086 /* Encode via the codec registry */
3087 v = PyCodec_Encode(unicode, encoding, errors);
3088 if (v == NULL)
3089 goto onError;
3090 return v;
3091
Benjamin Peterson29060642009-01-31 22:14:21 +00003092 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003093 return NULL;
3094}
3095
/* Locate the first wide character of the NUL-terminated string `wstr`
   that wcstombs() refuses to convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide and a high surrogate is
   followed by a low surrogate).  The index, in wchar_t units, of the first
   failing chunk is returned.  If every chunk converts, 0 is returned, so a
   result of 0 does not by itself distinguish the two cases. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminating NUL. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;
    const wchar_t *chunk_start;
    size_t converted;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif
    while (*cursor != L'\0') {
        chunk_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = *cursor;
        cursor += 1;
#endif
        converted = wcstombs(scratch, chunk, sizeof(scratch));
        if (converted == (size_t)-1)
            return chunk_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3142
Victor Stinner1b579672011-12-17 05:47:23 +01003143static int
3144locale_error_handler(const char *errors, int *surrogateescape)
3145{
3146 if (errors == NULL) {
3147 *surrogateescape = 0;
3148 return 0;
3149 }
3150
3151 if (strcmp(errors, "strict") == 0) {
3152 *surrogateescape = 0;
3153 return 0;
3154 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003155 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003156 *surrogateescape = 1;
3157 return 0;
3158 }
3159 PyErr_Format(PyExc_ValueError,
3160 "only 'strict' and 'surrogateescape' error handlers "
3161 "are supported, not '%s'",
3162 errors);
3163 return -1;
3164}
3165
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003166PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003167PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003168{
3169 Py_ssize_t wlen, wlen2;
3170 wchar_t *wstr;
3171 PyObject *bytes = NULL;
3172 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003173 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003174 PyObject *exc;
3175 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003176 int surrogateescape;
3177
3178 if (locale_error_handler(errors, &surrogateescape) < 0)
3179 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003180
3181 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3182 if (wstr == NULL)
3183 return NULL;
3184
3185 wlen2 = wcslen(wstr);
3186 if (wlen2 != wlen) {
3187 PyMem_Free(wstr);
3188 PyErr_SetString(PyExc_TypeError, "embedded null character");
3189 return NULL;
3190 }
3191
3192 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003193 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003194 char *str;
3195
3196 str = _Py_wchar2char(wstr, &error_pos);
3197 if (str == NULL) {
3198 if (error_pos == (size_t)-1) {
3199 PyErr_NoMemory();
3200 PyMem_Free(wstr);
3201 return NULL;
3202 }
3203 else {
3204 goto encode_error;
3205 }
3206 }
3207 PyMem_Free(wstr);
3208
3209 bytes = PyBytes_FromString(str);
3210 PyMem_Free(str);
3211 }
3212 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003213 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003214 size_t len, len2;
3215
3216 len = wcstombs(NULL, wstr, 0);
3217 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003218 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003219 goto encode_error;
3220 }
3221
3222 bytes = PyBytes_FromStringAndSize(NULL, len);
3223 if (bytes == NULL) {
3224 PyMem_Free(wstr);
3225 return NULL;
3226 }
3227
3228 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3229 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003230 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003231 goto encode_error;
3232 }
3233 PyMem_Free(wstr);
3234 }
3235 return bytes;
3236
3237encode_error:
3238 errmsg = strerror(errno);
3239 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003240
3241 if (error_pos == (size_t)-1)
3242 error_pos = wcstombs_errorpos(wstr);
3243
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003244 PyMem_Free(wstr);
3245 Py_XDECREF(bytes);
3246
Victor Stinner2f197072011-12-17 07:08:30 +01003247 if (errmsg != NULL) {
3248 size_t errlen;
3249 wstr = _Py_char2wchar(errmsg, &errlen);
3250 if (wstr != NULL) {
3251 reason = PyUnicode_FromWideChar(wstr, errlen);
3252 PyMem_Free(wstr);
3253 } else
3254 errmsg = NULL;
3255 }
3256 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003257 reason = PyUnicode_FromString(
3258 "wcstombs() encountered an unencodable "
3259 "wide character");
3260 if (reason == NULL)
3261 return NULL;
3262
3263 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3264 "locale", unicode,
3265 (Py_ssize_t)error_pos,
3266 (Py_ssize_t)(error_pos+1),
3267 reason);
3268 Py_DECREF(reason);
3269 if (exc != NULL) {
3270 PyCodec_StrictErrors(exc);
3271 Py_XDECREF(exc);
3272 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003273 return NULL;
3274}
3275
Victor Stinnerad158722010-10-27 00:25:46 +00003276PyObject *
3277PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003278{
Victor Stinner99b95382011-07-04 14:23:54 +02003279#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003280 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003281#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003282 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003283#else
Victor Stinner793b5312011-04-27 00:24:21 +02003284 PyInterpreterState *interp = PyThreadState_GET()->interp;
3285 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3286 cannot use it to encode and decode filenames before it is loaded. Load
3287 the Python codec requires to encode at least its own filename. Use the C
3288 version of the locale codec until the codec registry is initialized and
3289 the Python codec is loaded.
3290
3291 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3292 cannot only rely on it: check also interp->fscodec_initialized for
3293 subinterpreters. */
3294 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003295 return PyUnicode_AsEncodedString(unicode,
3296 Py_FileSystemDefaultEncoding,
3297 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003298 }
3299 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003300 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003301 }
Victor Stinnerad158722010-10-27 00:25:46 +00003302#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003303}
3304
Alexander Belopolsky40018472011-02-26 01:02:56 +00003305PyObject *
3306PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003307 const char *encoding,
3308 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309{
3310 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003311 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003312
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 if (!PyUnicode_Check(unicode)) {
3314 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 }
Fred Drakee4315f52000-05-09 19:53:39 +00003317
Fred Drakee4315f52000-05-09 19:53:39 +00003318 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003319 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003320 if ((strcmp(lower, "utf-8") == 0) ||
3321 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003322 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003323 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003324 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003325 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003326 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003327 }
Victor Stinner37296e82010-06-10 13:36:23 +00003328 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003329 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003330 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003331 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003332#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003333 else if (strcmp(lower, "mbcs") == 0)
3334 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003335#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003336 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003337 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339
3340 /* Encode via the codec registry */
3341 v = PyCodec_Encode(unicode, encoding, errors);
3342 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003343 return NULL;
3344
3345 /* The normal path */
3346 if (PyBytes_Check(v))
3347 return v;
3348
3349 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003350 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003351 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003352 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003353
3354 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3355 "encoder %s returned bytearray instead of bytes",
3356 encoding);
3357 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003358 Py_DECREF(v);
3359 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003360 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003361
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003362 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3363 Py_DECREF(v);
3364 return b;
3365 }
3366
3367 PyErr_Format(PyExc_TypeError,
3368 "encoder did not return a bytes object (type=%.400s)",
3369 Py_TYPE(v)->tp_name);
3370 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003371 return NULL;
3372}
3373
Alexander Belopolsky40018472011-02-26 01:02:56 +00003374PyObject *
3375PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003376 const char *encoding,
3377 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003378{
3379 PyObject *v;
3380
3381 if (!PyUnicode_Check(unicode)) {
3382 PyErr_BadArgument();
3383 goto onError;
3384 }
3385
3386 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003387 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003388
3389 /* Encode via the codec registry */
3390 v = PyCodec_Encode(unicode, encoding, errors);
3391 if (v == NULL)
3392 goto onError;
3393 if (!PyUnicode_Check(v)) {
3394 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003395 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003396 Py_TYPE(v)->tp_name);
3397 Py_DECREF(v);
3398 goto onError;
3399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003401
Benjamin Peterson29060642009-01-31 22:14:21 +00003402 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 return NULL;
3404}
3405
/* Locate the first byte sequence in str[0..len) that mbrtowc() cannot
   convert, starting from a zeroed conversion state.

   Returns the byte offset at which mbrtowc() reports a conversion error
   ((size_t)-1) or an incomplete character ((size_t)-2).  Returns 0 when no
   such position is found, so a result of 0 does not by itself distinguish
   the two cases.

   Without HAVE_MBRTOWC the position cannot be searched for and 0 is
   always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, (char*)str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): explicit fallback instead of a trailing statement
       that is unreachable whenever HAVE_MBRTOWC is defined. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3436
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003437PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003438PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003439 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003440{
3441 wchar_t smallbuf[256];
3442 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3443 wchar_t *wstr;
3444 size_t wlen, wlen2;
3445 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003446 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003447 size_t error_pos;
3448 char *errmsg;
3449 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003450
3451 if (locale_error_handler(errors, &surrogateescape) < 0)
3452 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003453
3454 if (str[len] != '\0' || len != strlen(str)) {
3455 PyErr_SetString(PyExc_TypeError, "embedded null character");
3456 return NULL;
3457 }
3458
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003459 if (surrogateescape) {
3460 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003461 wstr = _Py_char2wchar(str, &wlen);
3462 if (wstr == NULL) {
3463 if (wlen == (size_t)-1)
3464 PyErr_NoMemory();
3465 else
3466 PyErr_SetFromErrno(PyExc_OSError);
3467 return NULL;
3468 }
3469
3470 unicode = PyUnicode_FromWideChar(wstr, wlen);
3471 PyMem_Free(wstr);
3472 }
3473 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003474 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003475#ifndef HAVE_BROKEN_MBSTOWCS
3476 wlen = mbstowcs(NULL, str, 0);
3477#else
3478 wlen = len;
3479#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003480 if (wlen == (size_t)-1)
3481 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003482 if (wlen+1 <= smallbuf_len) {
3483 wstr = smallbuf;
3484 }
3485 else {
3486 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3487 return PyErr_NoMemory();
3488
3489 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3490 if (!wstr)
3491 return PyErr_NoMemory();
3492 }
3493
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003494 wlen2 = mbstowcs(wstr, str, wlen+1);
3495 if (wlen2 == (size_t)-1) {
3496 if (wstr != smallbuf)
3497 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003498 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003499 }
3500#ifdef HAVE_BROKEN_MBSTOWCS
3501 assert(wlen2 == wlen);
3502#endif
3503 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3504 if (wstr != smallbuf)
3505 PyMem_Free(wstr);
3506 }
3507 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003508
3509decode_error:
3510 errmsg = strerror(errno);
3511 assert(errmsg != NULL);
3512
3513 error_pos = mbstowcs_errorpos(str, len);
3514 if (errmsg != NULL) {
3515 size_t errlen;
3516 wstr = _Py_char2wchar(errmsg, &errlen);
3517 if (wstr != NULL) {
3518 reason = PyUnicode_FromWideChar(wstr, errlen);
3519 PyMem_Free(wstr);
3520 } else
3521 errmsg = NULL;
3522 }
3523 if (errmsg == NULL)
3524 reason = PyUnicode_FromString(
3525 "mbstowcs() encountered an invalid multibyte sequence");
3526 if (reason == NULL)
3527 return NULL;
3528
3529 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3530 "locale", str, len,
3531 (Py_ssize_t)error_pos,
3532 (Py_ssize_t)(error_pos+1),
3533 reason);
3534 Py_DECREF(reason);
3535 if (exc != NULL) {
3536 PyCodec_StrictErrors(exc);
3537 Py_XDECREF(exc);
3538 }
3539 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003540}
3541
3542PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003543PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003544{
3545 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003546 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547}
3548
3549
3550PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003551PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003552 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003553 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3554}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003555
Christian Heimes5894ba72007-11-04 11:43:14 +00003556PyObject*
3557PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3558{
Victor Stinner99b95382011-07-04 14:23:54 +02003559#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003560 return PyUnicode_DecodeMBCS(s, size, NULL);
3561#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003562 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003563#else
Victor Stinner793b5312011-04-27 00:24:21 +02003564 PyInterpreterState *interp = PyThreadState_GET()->interp;
3565 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3566 cannot use it to encode and decode filenames before it is loaded. Load
3567 the Python codec requires to encode at least its own filename. Use the C
3568 version of the locale codec until the codec registry is initialized and
3569 the Python codec is loaded.
3570
3571 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3572 cannot only rely on it: check also interp->fscodec_initialized for
3573 subinterpreters. */
3574 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003575 return PyUnicode_Decode(s, size,
3576 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003577 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003578 }
3579 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003580 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003581 }
Victor Stinnerad158722010-10-27 00:25:46 +00003582#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003583}
3584
Martin v. Löwis011e8422009-05-05 04:43:17 +00003585
3586int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003587_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003588{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003589 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003590
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003591 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003592 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003593 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3594 PyUnicode_GET_LENGTH(str), '\0', 1);
3595 if (pos == -1)
3596 return 0;
3597 else
3598 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003599}
3600
Antoine Pitrou13348842012-01-29 18:36:34 +01003601int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003602PyUnicode_FSConverter(PyObject* arg, void* addr)
3603{
3604 PyObject *output = NULL;
3605 Py_ssize_t size;
3606 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003607 if (arg == NULL) {
3608 Py_DECREF(*(PyObject**)addr);
3609 return 1;
3610 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003611 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003612 output = arg;
3613 Py_INCREF(output);
3614 }
3615 else {
3616 arg = PyUnicode_FromObject(arg);
3617 if (!arg)
3618 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003619 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620 Py_DECREF(arg);
3621 if (!output)
3622 return 0;
3623 if (!PyBytes_Check(output)) {
3624 Py_DECREF(output);
3625 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3626 return 0;
3627 }
3628 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003629 size = PyBytes_GET_SIZE(output);
3630 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003631 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003632 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003633 Py_DECREF(output);
3634 return 0;
3635 }
3636 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003637 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003638}
3639
3640
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003641int
3642PyUnicode_FSDecoder(PyObject* arg, void* addr)
3643{
3644 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003645 if (arg == NULL) {
3646 Py_DECREF(*(PyObject**)addr);
3647 return 1;
3648 }
3649 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003650 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003651 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003652 output = arg;
3653 Py_INCREF(output);
3654 }
3655 else {
3656 arg = PyBytes_FromObject(arg);
3657 if (!arg)
3658 return 0;
3659 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3660 PyBytes_GET_SIZE(arg));
3661 Py_DECREF(arg);
3662 if (!output)
3663 return 0;
3664 if (!PyUnicode_Check(output)) {
3665 Py_DECREF(output);
3666 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3667 return 0;
3668 }
3669 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003670 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003671 Py_DECREF(output);
3672 return 0;
3673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003674 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003675 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003676 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3677 Py_DECREF(output);
3678 return 0;
3679 }
3680 *(PyObject**)addr = output;
3681 return Py_CLEANUP_SUPPORTED;
3682}
3683
3684
Martin v. Löwis5b222132007-06-10 09:51:05 +00003685char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003686PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003687{
Christian Heimesf3863112007-11-22 07:46:41 +00003688 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003689
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003690 if (!PyUnicode_Check(unicode)) {
3691 PyErr_BadArgument();
3692 return NULL;
3693 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003694 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003695 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003696
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003697 if (PyUnicode_UTF8(unicode) == NULL) {
3698 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003699 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3700 if (bytes == NULL)
3701 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003702 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3703 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704 Py_DECREF(bytes);
3705 return NULL;
3706 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003707 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3708 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3709 PyBytes_AS_STRING(bytes),
3710 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 Py_DECREF(bytes);
3712 }
3713
3714 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003715 *psize = PyUnicode_UTF8_LENGTH(unicode);
3716 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003717}
3718
3719char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3723}
3724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003725Py_UNICODE *
3726PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728 const unsigned char *one_byte;
3729#if SIZEOF_WCHAR_T == 4
3730 const Py_UCS2 *two_bytes;
3731#else
3732 const Py_UCS4 *four_bytes;
3733 const Py_UCS4 *ucs4_end;
3734 Py_ssize_t num_surrogates;
3735#endif
3736 wchar_t *w;
3737 wchar_t *wchar_end;
3738
3739 if (!PyUnicode_Check(unicode)) {
3740 PyErr_BadArgument();
3741 return NULL;
3742 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003743 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003745 assert(_PyUnicode_KIND(unicode) != 0);
3746 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003748 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003749#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3751 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 num_surrogates = 0;
3753
3754 for (; four_bytes < ucs4_end; ++four_bytes) {
3755 if (*four_bytes > 0xFFFF)
3756 ++num_surrogates;
3757 }
3758
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3760 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3761 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762 PyErr_NoMemory();
3763 return NULL;
3764 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003767 w = _PyUnicode_WSTR(unicode);
3768 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3769 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003770 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3771 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003772 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003774 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3775 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 }
3777 else
3778 *w = *four_bytes;
3779
3780 if (w > wchar_end) {
3781 assert(0 && "Miscalculated string end");
3782 }
3783 }
3784 *w = 0;
3785#else
3786 /* sizeof(wchar_t) == 4 */
3787 Py_FatalError("Impossible unicode object state, wstr and str "
3788 "should share memory already.");
3789 return NULL;
3790#endif
3791 }
3792 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003793 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3794 (_PyUnicode_LENGTH(unicode) + 1));
3795 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 PyErr_NoMemory();
3797 return NULL;
3798 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003799 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3800 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3801 w = _PyUnicode_WSTR(unicode);
3802 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003804 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3805 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 for (; w < wchar_end; ++one_byte, ++w)
3807 *w = *one_byte;
3808 /* null-terminate the wstr */
3809 *w = 0;
3810 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 for (; w < wchar_end; ++two_bytes, ++w)
3815 *w = *two_bytes;
3816 /* null-terminate the wstr */
3817 *w = 0;
3818#else
3819 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003820 PyObject_FREE(_PyUnicode_WSTR(unicode));
3821 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822 Py_FatalError("Impossible unicode object state, wstr "
3823 "and str should share memory already.");
3824 return NULL;
3825#endif
3826 }
3827 else {
3828 assert(0 && "This should never happen.");
3829 }
3830 }
3831 }
3832 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 *size = PyUnicode_WSTR_LENGTH(unicode);
3834 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003835}
3836
Alexander Belopolsky40018472011-02-26 01:02:56 +00003837Py_UNICODE *
3838PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841}
3842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843
Alexander Belopolsky40018472011-02-26 01:02:56 +00003844Py_ssize_t
3845PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846{
3847 if (!PyUnicode_Check(unicode)) {
3848 PyErr_BadArgument();
3849 goto onError;
3850 }
3851 return PyUnicode_GET_SIZE(unicode);
3852
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 return -1;
3855}
3856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857Py_ssize_t
3858PyUnicode_GetLength(PyObject *unicode)
3859{
Victor Stinner07621332012-06-16 04:53:46 +02003860 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 PyErr_BadArgument();
3862 return -1;
3863 }
Victor Stinner07621332012-06-16 04:53:46 +02003864 if (PyUnicode_READY(unicode) == -1)
3865 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866 return PyUnicode_GET_LENGTH(unicode);
3867}
3868
3869Py_UCS4
3870PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3871{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003872 void *data;
3873 int kind;
3874
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003875 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3876 PyErr_BadArgument();
3877 return (Py_UCS4)-1;
3878 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003879 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003880 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 return (Py_UCS4)-1;
3882 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003883 data = PyUnicode_DATA(unicode);
3884 kind = PyUnicode_KIND(unicode);
3885 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886}
3887
3888int
3889PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3890{
3891 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003892 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 return -1;
3894 }
Victor Stinner488fa492011-12-12 00:01:39 +01003895 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003896 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003897 PyErr_SetString(PyExc_IndexError, "string index out of range");
3898 return -1;
3899 }
Victor Stinner488fa492011-12-12 00:01:39 +01003900 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003901 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003902 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3903 PyErr_SetString(PyExc_ValueError, "character out of range");
3904 return -1;
3905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3907 index, ch);
3908 return 0;
3909}
3910
/* Return the name of the default encoding, which is always "utf-8".
   The result points to static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3916
Victor Stinner554f3f02010-06-16 23:33:54 +00003917/* create or adjust a UnicodeDecodeError */
3918static void
3919make_decode_exception(PyObject **exceptionObject,
3920 const char *encoding,
3921 const char *input, Py_ssize_t length,
3922 Py_ssize_t startpos, Py_ssize_t endpos,
3923 const char *reason)
3924{
3925 if (*exceptionObject == NULL) {
3926 *exceptionObject = PyUnicodeDecodeError_Create(
3927 encoding, input, length, startpos, endpos, reason);
3928 }
3929 else {
3930 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3931 goto onError;
3932 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3933 goto onError;
3934 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3935 goto onError;
3936 }
3937 return;
3938
3939onError:
3940 Py_DECREF(*exceptionObject);
3941 *exceptionObject = NULL;
3942}
3943
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003944#ifdef HAVE_MBCS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945/* error handling callback helper:
3946 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003947 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 and adjust various state variables.
3949 return 0 on success, -1 on error
3950*/
3951
Alexander Belopolsky40018472011-02-26 01:02:56 +00003952static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003953unicode_decode_call_errorhandler_wchar(
3954 const char *errors, PyObject **errorHandler,
3955 const char *encoding, const char *reason,
3956 const char **input, const char **inend, Py_ssize_t *startinpos,
3957 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3958 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003959{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003960 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961
3962 PyObject *restuple = NULL;
3963 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003964 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003965 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003966 Py_ssize_t requiredsize;
3967 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003968 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003969 wchar_t *repwstr;
3970 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003972 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
3973 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01003974
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 *errorHandler = PyCodec_LookupError(errors);
3977 if (*errorHandler == NULL)
3978 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 }
3980
Victor Stinner554f3f02010-06-16 23:33:54 +00003981 make_decode_exception(exceptionObject,
3982 encoding,
3983 *input, *inend - *input,
3984 *startinpos, *endinpos,
3985 reason);
3986 if (*exceptionObject == NULL)
3987 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988
3989 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3990 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003993 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 }
3996 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003998
3999 /* Copy back the bytes variables, which might have been modified by the
4000 callback */
4001 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4002 if (!inputobj)
4003 goto onError;
4004 if (!PyBytes_Check(inputobj)) {
4005 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4006 }
4007 *input = PyBytes_AS_STRING(inputobj);
4008 insize = PyBytes_GET_SIZE(inputobj);
4009 *inend = *input + insize;
4010 /* we can DECREF safely, as the exception has another reference,
4011 so the object won't go away. */
4012 Py_DECREF(inputobj);
4013
4014 if (newpos<0)
4015 newpos = insize+newpos;
4016 if (newpos<0 || newpos>insize) {
4017 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4018 goto onError;
4019 }
4020
4021 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4022 if (repwstr == NULL)
4023 goto onError;
4024 /* need more space? (at least enough for what we
4025 have+the replacement+the rest of the string (starting
4026 at the new input position), so we won't have to check space
4027 when there are no errors in the rest of the string) */
4028 requiredsize = *outpos + repwlen + insize-newpos;
4029 if (requiredsize > outsize) {
4030 if (requiredsize < 2*outsize)
4031 requiredsize = 2*outsize;
4032 if (unicode_resize(output, requiredsize) < 0)
4033 goto onError;
4034 }
4035 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4036 *outpos += repwlen;
4037
4038 *endinpos = newpos;
4039 *inptr = *input + newpos;
4040
4041 /* we made it! */
4042 Py_XDECREF(restuple);
4043 return 0;
4044
4045 onError:
4046 Py_XDECREF(restuple);
4047 return -1;
4048}
4049#endif /* HAVE_MBCS */
4050
4051static int
4052unicode_decode_call_errorhandler_writer(
4053 const char *errors, PyObject **errorHandler,
4054 const char *encoding, const char *reason,
4055 const char **input, const char **inend, Py_ssize_t *startinpos,
4056 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4057 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4058{
4059 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4060
4061 PyObject *restuple = NULL;
4062 PyObject *repunicode = NULL;
4063 Py_ssize_t insize;
4064 Py_ssize_t newpos;
4065 PyObject *inputobj = NULL;
4066
4067 if (*errorHandler == NULL) {
4068 *errorHandler = PyCodec_LookupError(errors);
4069 if (*errorHandler == NULL)
4070 goto onError;
4071 }
4072
4073 make_decode_exception(exceptionObject,
4074 encoding,
4075 *input, *inend - *input,
4076 *startinpos, *endinpos,
4077 reason);
4078 if (*exceptionObject == NULL)
4079 goto onError;
4080
4081 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4082 if (restuple == NULL)
4083 goto onError;
4084 if (!PyTuple_Check(restuple)) {
4085 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4086 goto onError;
4087 }
4088 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004089 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004090
4091 /* Copy back the bytes variables, which might have been modified by the
4092 callback */
4093 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4094 if (!inputobj)
4095 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004096 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004098 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004099 *input = PyBytes_AS_STRING(inputobj);
4100 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004101 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004102 /* we can DECREF safely, as the exception has another reference,
4103 so the object won't go away. */
4104 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004108 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4110 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004111 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004113 writer->overallocate = 1;
4114 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4115 return
4116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004118 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004121 Py_XDECREF(restuple);
4122 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004126 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127}
4128
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004129/* --- UTF-7 Codec -------------------------------------------------------- */
4130
Antoine Pitrou244651a2009-05-04 18:56:13 +00004131/* See RFC2152 for details. We encode conservatively and decode liberally. */
4132
/* Base-64 helpers for the UTF-7 codec.  These are plain macros and may
   evaluate their argument several times, so only pass expressions
   without side effects. */

/* Non-zero when c belongs to the base-64 alphabet:
   'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value (0..63).  The result is
   only meaningful when IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) == '+') ? 62 : 63)

/* Map the low 6 bits of n to the matching base-64 character. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero when a byte met in UTF-7 input outside a
 * base-64 section stands for itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
/* Classification of the 128 ASCII codes used by the UTF-7 encoder
 * (see RFC2152).  utf7_category[c] is one of:
 *   0 : "Set D"      - alphanumerics and '(),-./:?
 *   1 : "Set O"      - !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace" - ht nl cr sp
 *   3 : special (must be base64 encoded) - everything else, i.e. +\~
 *       and the non-printing codes 0-8 11-12 14-31 127
 * Each row below covers 16 consecutive codes starting at the offset
 * given in its comment.
 */

static
char utf7_category[128] = {
    /* 0x00: nul .. si  (ht, nl and cr are whitespace) */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle .. us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};

/* ENCODE_DIRECT: non-zero when character c may be emitted as itself by
 * the encoder.  Set D characters always qualify; Set O characters only
 * when directO is true, and whitespace only when directWS is true.
 * RFC2152 leaves both choices to the application, hence the two flags.
 * Code 0 and anything >= 128 never qualify.  c is evaluated several
 * times, so it must have no side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004209
Alexander Belopolsky40018472011-02-26 01:02:56 +00004210PyObject *
4211PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004212 Py_ssize_t size,
4213 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004214{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004215 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4216}
4217
Antoine Pitrou244651a2009-05-04 18:56:13 +00004218/* The decoder. The only state we preserve is our read position,
4219 * i.e. how many characters we have consumed. So if we end in the
4220 * middle of a shift sequence we have to back off the read position
4221 * and the output to the beginning of the sequence, otherwise we lose
4222 * all the shift state (seen bits, number of bits seen, high
4223 * surrogate). */
4224
Alexander Belopolsky40018472011-02-26 01:02:56 +00004225PyObject *
4226PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004227 Py_ssize_t size,
4228 const char *errors,
4229 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004230{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004232 Py_ssize_t startinpos;
4233 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004234 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004236 const char *errmsg = "";
4237 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004238 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004239 unsigned int base64bits = 0;
4240 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004241 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 PyObject *errorHandler = NULL;
4243 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004244
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004245 if (size == 0) {
4246 if (consumed)
4247 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004248 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004249 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004250
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251 /* Start off assuming it's all ASCII. Widen later as necessary. */
4252 _PyUnicodeWriter_Init(&writer, 0);
4253 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4254 goto onError;
4255
4256 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004257 e = s + size;
4258
4259 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004260 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004262 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004263
Antoine Pitrou244651a2009-05-04 18:56:13 +00004264 if (inShift) { /* in a base-64 section */
4265 if (IS_BASE64(ch)) { /* consume a base-64 character */
4266 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4267 base64bits += 6;
4268 s++;
4269 if (base64bits >= 16) {
4270 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004271 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004272 base64bits -= 16;
4273 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4274 if (surrogate) {
4275 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004276 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4277 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004278 if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004279 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004280 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
4281 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004282 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004283 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004284 }
4285 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004286 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004287 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004288 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4289 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004291 }
4292 }
Victor Stinner551ac952011-11-29 22:58:13 +01004293 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004294 /* first surrogate */
4295 surrogate = outCh;
4296 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004299 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004300 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
4301 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004302 }
4303 }
4304 }
4305 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306 inShift = 0;
4307 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004308 if (surrogate) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004309 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004310 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004311 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4312 writer.pos++;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004313 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004314 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315 if (base64bits > 0) { /* left-over bits */
4316 if (base64bits >= 6) {
4317 /* We've seen at least one base-64 character */
4318 errmsg = "partial character in shift sequence";
4319 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004320 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321 else {
4322 /* Some bits remain; they should be zero */
4323 if (base64buffer != 0) {
4324 errmsg = "non-zero padding bits in shift sequence";
4325 goto utf7Error;
4326 }
4327 }
4328 }
4329 if (ch != '-') {
4330 /* '-' is absorbed; other terminating
4331 characters are preserved */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004334 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4335 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004337 }
4338 }
4339 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 s++; /* consume '+' */
4342 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004345 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
4347 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 }
4349 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 }
4354 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004356 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004357 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4358 goto onError;
4359 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4360 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 else {
4363 startinpos = s-starts;
4364 s++;
4365 errmsg = "unexpected special character";
4366 goto utf7Error;
4367 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 errors, &errorHandler,
4373 "utf7", errmsg,
4374 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004375 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 }
4378
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 /* end of string */
4380
4381 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4382 /* if we're in an inconsistent state, that's an error */
4383 if (surrogate ||
4384 (base64bits >= 6) ||
4385 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004387 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 errors, &errorHandler,
4389 "utf7", "unterminated shift sequence",
4390 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004391 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392 goto onError;
4393 if (s < e)
4394 goto restart;
4395 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397
4398 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004399 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004401 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004402 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 }
4404 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004405 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004407 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 Py_XDECREF(errorHandler);
4410 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 Py_XDECREF(errorHandler);
4415 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 return NULL;
4418}
4419
4420
Alexander Belopolsky40018472011-02-26 01:02:56 +00004421PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004422_PyUnicode_EncodeUTF7(PyObject *str,
4423 int base64SetO,
4424 int base64WhiteSpace,
4425 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004427 int kind;
4428 void *data;
4429 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004430 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004432 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 unsigned int base64bits = 0;
4434 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435 char * out;
4436 char * start;
4437
Benjamin Petersonbac79492012-01-14 13:34:47 -05004438 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004439 return NULL;
4440 kind = PyUnicode_KIND(str);
4441 data = PyUnicode_DATA(str);
4442 len = PyUnicode_GET_LENGTH(str);
4443
4444 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004446
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004447 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004448 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004449 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004450 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451 if (v == NULL)
4452 return NULL;
4453
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004454 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004455 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004456 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 if (inShift) {
4459 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4460 /* shifting out */
4461 if (base64bits) { /* output remaining bits */
4462 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4463 base64buffer = 0;
4464 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465 }
4466 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 /* Characters not in the BASE64 set implicitly unshift the sequence
4468 so no '-' is required, except if the character is itself a '-' */
4469 if (IS_BASE64(ch) || ch == '-') {
4470 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004472 *out++ = (char) ch;
4473 }
4474 else {
4475 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004476 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 else { /* not in a shift sequence */
4479 if (ch == '+') {
4480 *out++ = '+';
4481 *out++ = '-';
4482 }
4483 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4484 *out++ = (char) ch;
4485 }
4486 else {
4487 *out++ = '+';
4488 inShift = 1;
4489 goto encode_char;
4490 }
4491 }
4492 continue;
4493encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004495 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004496
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 /* code first surrogate */
4498 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004499 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 while (base64bits >= 6) {
4501 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4502 base64bits -= 6;
4503 }
4504 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004505 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 base64bits += 16;
4508 base64buffer = (base64buffer << 16) | ch;
4509 while (base64bits >= 6) {
4510 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4511 base64bits -= 6;
4512 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004513 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 if (base64bits)
4515 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4516 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004518 if (_PyBytes_Resize(&v, out - start) < 0)
4519 return NULL;
4520 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004522PyObject *
4523PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4524 Py_ssize_t size,
4525 int base64SetO,
4526 int base64WhiteSpace,
4527 const char *errors)
4528{
4529 PyObject *result;
4530 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4531 if (tmp == NULL)
4532 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004533 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004534 base64WhiteSpace, errors);
4535 Py_DECREF(tmp);
4536 return result;
4537}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539#undef IS_BASE64
4540#undef FROM_BASE64
4541#undef TO_BASE64
4542#undef DECODE_DIRECT
4543#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545/* --- UTF-8 Codec -------------------------------------------------------- */
4546
Alexander Belopolsky40018472011-02-26 01:02:56 +00004547PyObject *
4548PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004549 Py_ssize_t size,
4550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551{
Walter Dörwald69652032004-09-07 20:24:22 +00004552 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4553}
4554
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004555#include "stringlib/asciilib.h"
4556#include "stringlib/codecs.h"
4557#include "stringlib/undef.h"
4558
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004559#include "stringlib/ucs1lib.h"
4560#include "stringlib/codecs.h"
4561#include "stringlib/undef.h"
4562
4563#include "stringlib/ucs2lib.h"
4564#include "stringlib/codecs.h"
4565#include "stringlib/undef.h"
4566
4567#include "stringlib/ucs4lib.h"
4568#include "stringlib/codecs.h"
4569#include "stringlib/undef.h"
4570
Antoine Pitrouab868312009-01-10 15:40:25 +00004571/* Mask to quickly check whether a C 'long' contains a
4572 non-ASCII, UTF8-encoded char. */
4573#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004574# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004575#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004576# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004577#else
4578# error C 'long' size should be either 4 or 8!
4579#endif
4580
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004581static Py_ssize_t
4582ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004583{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004584 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004585 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004586
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004587#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004588 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4589 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004590 /* Fast path, see in STRINGLIB(utf8_decode) for
4591 an explanation. */
4592 /* Help register allocation */
4593 register const char *_p = p;
4594 register Py_UCS1 * q = dest;
4595 while (_p < aligned_end) {
4596 unsigned long value = *(const unsigned long *) _p;
4597 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004599 *((unsigned long *)q) = value;
4600 _p += SIZEOF_LONG;
4601 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004602 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004603 p = _p;
4604 while (p < end) {
4605 if ((unsigned char)*p & 0x80)
4606 break;
4607 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004609 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004611#endif
4612 while (p < end) {
4613 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4614 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004615 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004616 /* Help register allocation */
4617 register const char *_p = p;
4618 while (_p < aligned_end) {
4619 unsigned long value = *(unsigned long *) _p;
4620 if (value & ASCII_CHAR_MASK)
4621 break;
4622 _p += SIZEOF_LONG;
4623 }
4624 p = _p;
4625 if (_p == end)
4626 break;
4627 }
4628 if ((unsigned char)*p & 0x80)
4629 break;
4630 ++p;
4631 }
4632 memcpy(dest, start, p - start);
4633 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634}
Antoine Pitrouab868312009-01-10 15:40:25 +00004635
Victor Stinner785938e2011-12-11 20:09:03 +01004636PyObject *
4637PyUnicode_DecodeUTF8Stateful(const char *s,
4638 Py_ssize_t size,
4639 const char *errors,
4640 Py_ssize_t *consumed)
4641{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004642 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004643 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004644 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004645
4646 Py_ssize_t startinpos;
4647 Py_ssize_t endinpos;
4648 const char *errmsg = "";
4649 PyObject *errorHandler = NULL;
4650 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004651
4652 if (size == 0) {
4653 if (consumed)
4654 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004655 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004656 }
4657
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4659 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004660 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004661 *consumed = 1;
4662 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004663 }
4664
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004665 _PyUnicodeWriter_Init(&writer, 0);
4666 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4667 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004668
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004669 writer.pos = ascii_decode(s, end, writer.data);
4670 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671 while (s < end) {
4672 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004675 if (PyUnicode_IS_ASCII(writer.buffer))
4676 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004678 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004679 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681 } else {
4682 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004683 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684 }
4685
4686 switch (ch) {
4687 case 0:
4688 if (s == end || consumed)
4689 goto End;
4690 errmsg = "unexpected end of data";
4691 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004692 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 break;
4694 case 1:
4695 errmsg = "invalid start byte";
4696 startinpos = s - starts;
4697 endinpos = startinpos + 1;
4698 break;
4699 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004700 case 3:
4701 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 errmsg = "invalid continuation byte";
4703 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004704 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004705 break;
4706 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004707 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004708 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004709 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4710 writer.pos++;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 continue;
4712 }
4713
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004714 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004715 errors, &errorHandler,
4716 "utf-8", errmsg,
4717 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004719 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004720 }
4721
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723 if (consumed)
4724 *consumed = s - starts;
4725
4726 Py_XDECREF(errorHandler);
4727 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004728 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729
4730onError:
4731 Py_XDECREF(errorHandler);
4732 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004733 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004735}
4736
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s`.  Every byte that cannot be decoded is
   stored as the code unit 0xDC00 + byte.  The result is terminated with
   L'\0'.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error
   or if the buffer size would overflow. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
        /* The decoder writes what it can straight into `unicode` and
           returns either an error code or a character it could not store. */
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* no character is handed back when wchar_t holds 32 bits */
            assert(0);
#else
            /* The value handed back is a code point that does not fit in
               one 16-bit unit, not a surrogate: it must lie above the BMP
               for the split below to be meaningful. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* 0 with the input exhausted means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape: keep the offending byte as 0xDC00 + byte
               and resume decoding after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004793/* Primary internal function which creates utf8 encoded bytes objects.
4794
4795 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004796 and allocate exactly as much space needed at the end. Else allocate the
4797 maximum possible needed (4 result bytes per Unicode character), and return
4798 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004799*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004800PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004801_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802{
Victor Stinner6099a032011-12-18 14:22:26 +01004803 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004804 void *data;
4805 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004807 if (!PyUnicode_Check(unicode)) {
4808 PyErr_BadArgument();
4809 return NULL;
4810 }
4811
4812 if (PyUnicode_READY(unicode) == -1)
4813 return NULL;
4814
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004815 if (PyUnicode_UTF8(unicode))
4816 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4817 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004818
4819 kind = PyUnicode_KIND(unicode);
4820 data = PyUnicode_DATA(unicode);
4821 size = PyUnicode_GET_LENGTH(unicode);
4822
Benjamin Petersonead6b532011-12-20 17:23:42 -06004823 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004824 default:
4825 assert(0);
4826 case PyUnicode_1BYTE_KIND:
4827 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4828 assert(!PyUnicode_IS_ASCII(unicode));
4829 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4830 case PyUnicode_2BYTE_KIND:
4831 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4832 case PyUnicode_4BYTE_KIND:
4833 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835}
4836
Alexander Belopolsky40018472011-02-26 01:02:56 +00004837PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004838PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4839 Py_ssize_t size,
4840 const char *errors)
4841{
4842 PyObject *v, *unicode;
4843
4844 unicode = PyUnicode_FromUnicode(s, size);
4845 if (unicode == NULL)
4846 return NULL;
4847 v = _PyUnicode_AsUTF8String(unicode, errors);
4848 Py_DECREF(unicode);
4849 return v;
4850}
4851
4852PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004853PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004855 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856}
4857
Walter Dörwald41980ca2007-08-16 21:55:45 +00004858/* --- UTF-32 Codec ------------------------------------------------------- */
4859
4860PyObject *
4861PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 Py_ssize_t size,
4863 const char *errors,
4864 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004865{
4866 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4867}
4868
4869PyObject *
4870PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 Py_ssize_t size,
4872 const char *errors,
4873 int *byteorder,
4874 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004875{
4876 const char *starts = s;
4877 Py_ssize_t startinpos;
4878 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004879 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004880 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004881 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004882 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004883 PyObject *errorHandler = NULL;
4884 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004885
Walter Dörwald41980ca2007-08-16 21:55:45 +00004886 q = (unsigned char *)s;
4887 e = q + size;
4888
4889 if (byteorder)
4890 bo = *byteorder;
4891
4892 /* Check for BOM marks (U+FEFF) in the input and adjust current
4893 byte order setting accordingly. In native mode, the leading BOM
4894 mark is skipped, in all other modes, it is copied to the output
4895 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004896 if (bo == 0 && size >= 4) {
4897 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4898 if (bom == 0x0000FEFF) {
4899 bo = -1;
4900 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004902 else if (bom == 0xFFFE0000) {
4903 bo = 1;
4904 q += 4;
4905 }
4906 if (byteorder)
4907 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004908 }
4909
Victor Stinnere64322e2012-10-30 23:12:47 +01004910 if (q == e) {
4911 if (consumed)
4912 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004913 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004914 }
4915
Victor Stinnere64322e2012-10-30 23:12:47 +01004916#ifdef WORDS_BIGENDIAN
4917 le = bo < 0;
4918#else
4919 le = bo <= 0;
4920#endif
4921
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004922 _PyUnicodeWriter_Init(&writer, 0);
4923 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4924 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004925
Victor Stinnere64322e2012-10-30 23:12:47 +01004926 while (1) {
4927 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004928 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004929
Victor Stinnere64322e2012-10-30 23:12:47 +01004930 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004931 enum PyUnicode_Kind kind = writer.kind;
4932 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004933 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004934 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004935 if (le) {
4936 do {
4937 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4938 if (ch > maxch)
4939 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004940 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004941 q += 4;
4942 } while (q <= last);
4943 }
4944 else {
4945 do {
4946 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4947 if (ch > maxch)
4948 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004949 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004950 q += 4;
4951 } while (q <= last);
4952 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004953 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004954 }
4955
4956 if (ch <= maxch) {
4957 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004959 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004961 startinpos = ((const char *)q) - starts;
4962 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004964 else {
4965 if (ch < 0x110000) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004966 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinnere64322e2012-10-30 23:12:47 +01004967 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004968 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4969 writer.pos++;
Victor Stinnere64322e2012-10-30 23:12:47 +01004970 q += 4;
4971 continue;
4972 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004974 startinpos = ((const char *)q) - starts;
4975 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004977
4978 /* The remaining input chars are ignored if the callback
4979 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004980 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 errors, &errorHandler,
4982 "utf32", errmsg,
4983 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004984 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986 }
4987
Walter Dörwald41980ca2007-08-16 21:55:45 +00004988 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991 Py_XDECREF(errorHandler);
4992 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004994
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997 Py_XDECREF(errorHandler);
4998 Py_XDECREF(exc);
4999 return NULL;
5000}
5001
5002PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005003_PyUnicode_EncodeUTF32(PyObject *str,
5004 const char *errors,
5005 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005007 int kind;
5008 void *data;
5009 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005010 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005012 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005013 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005014#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005015 int iorder[] = {0, 1, 2, 3};
5016#else
5017 int iorder[] = {3, 2, 1, 0};
5018#endif
5019
Benjamin Peterson29060642009-01-31 22:14:21 +00005020#define STORECHAR(CH) \
5021 do { \
5022 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5023 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5024 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5025 p[iorder[0]] = (CH) & 0xff; \
5026 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005027 } while(0)
5028
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005029 if (!PyUnicode_Check(str)) {
5030 PyErr_BadArgument();
5031 return NULL;
5032 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005033 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005034 return NULL;
5035 kind = PyUnicode_KIND(str);
5036 data = PyUnicode_DATA(str);
5037 len = PyUnicode_GET_LENGTH(str);
5038
5039 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005040 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005042 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043 if (v == NULL)
5044 return NULL;
5045
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005046 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005049 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005050 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051
5052 if (byteorder == -1) {
5053 /* force LE */
5054 iorder[0] = 0;
5055 iorder[1] = 1;
5056 iorder[2] = 2;
5057 iorder[3] = 3;
5058 }
5059 else if (byteorder == 1) {
5060 /* force BE */
5061 iorder[0] = 3;
5062 iorder[1] = 2;
5063 iorder[2] = 1;
5064 iorder[3] = 0;
5065 }
5066
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005067 for (i = 0; i < len; i++)
5068 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005069
5070 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005071 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072#undef STORECHAR
5073}
5074
Alexander Belopolsky40018472011-02-26 01:02:56 +00005075PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005076PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5077 Py_ssize_t size,
5078 const char *errors,
5079 int byteorder)
5080{
5081 PyObject *result;
5082 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5083 if (tmp == NULL)
5084 return NULL;
5085 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5086 Py_DECREF(tmp);
5087 return result;
5088}
5089
5090PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005091PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092{
Victor Stinnerb960b342011-11-20 19:12:52 +01005093 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094}
5095
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096/* --- UTF-16 Codec ------------------------------------------------------- */
5097
Tim Peters772747b2001-08-09 22:21:55 +00005098PyObject *
5099PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 Py_ssize_t size,
5101 const char *errors,
5102 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103{
Walter Dörwald69652032004-09-07 20:24:22 +00005104 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5105}
5106
5107PyObject *
5108PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 Py_ssize_t size,
5110 const char *errors,
5111 int *byteorder,
5112 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005113{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005115 Py_ssize_t startinpos;
5116 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005117 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005118 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005119 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005120 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005121 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 PyObject *errorHandler = NULL;
5123 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124
Tim Peters772747b2001-08-09 22:21:55 +00005125 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005126 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127
5128 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005129 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005131 /* Check for BOM marks (U+FEFF) in the input and adjust current
5132 byte order setting accordingly. In native mode, the leading BOM
5133 mark is skipped, in all other modes, it is copied to the output
5134 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005135 if (bo == 0 && size >= 2) {
5136 const Py_UCS4 bom = (q[1] << 8) | q[0];
5137 if (bom == 0xFEFF) {
5138 q += 2;
5139 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005141 else if (bom == 0xFFFE) {
5142 q += 2;
5143 bo = 1;
5144 }
5145 if (byteorder)
5146 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148
Antoine Pitrou63065d72012-05-15 23:48:04 +02005149 if (q == e) {
5150 if (consumed)
5151 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005152 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005153 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005154
Christian Heimes743e0cd2012-10-17 23:52:17 +02005155#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005156 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005157#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005158 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005159#endif
Tim Peters772747b2001-08-09 22:21:55 +00005160
Antoine Pitrou63065d72012-05-15 23:48:04 +02005161 /* Note: size will always be longer than the resulting Unicode
5162 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005163 _PyUnicodeWriter_Init(&writer, 0);
5164 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5165 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005166
Antoine Pitrou63065d72012-05-15 23:48:04 +02005167 while (1) {
5168 Py_UCS4 ch = 0;
5169 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005170 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005171 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005172 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005173 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005174 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005175 native_ordering);
5176 else
5177 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005178 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005179 native_ordering);
5180 } else if (kind == PyUnicode_2BYTE_KIND) {
5181 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005182 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005183 native_ordering);
5184 } else {
5185 assert(kind == PyUnicode_4BYTE_KIND);
5186 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005187 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005188 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005189 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005190 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005191
Antoine Pitrou63065d72012-05-15 23:48:04 +02005192 switch (ch)
5193 {
5194 case 0:
5195 /* remaining byte at the end? (size should be even) */
5196 if (q == e || consumed)
5197 goto End;
5198 errmsg = "truncated data";
5199 startinpos = ((const char *)q) - starts;
5200 endinpos = ((const char *)e) - starts;
5201 break;
5202 /* The remaining input chars are ignored if the callback
5203 chooses to skip the input */
5204 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005205 q -= 2;
5206 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005207 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005208 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005209 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005210 endinpos = ((const char *)e) - starts;
5211 break;
5212 case 2:
5213 errmsg = "illegal encoding";
5214 startinpos = ((const char *)q) - 2 - starts;
5215 endinpos = startinpos + 2;
5216 break;
5217 case 3:
5218 errmsg = "illegal UTF-16 surrogate";
5219 startinpos = ((const char *)q) - 4 - starts;
5220 endinpos = startinpos + 2;
5221 break;
5222 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005223 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005224 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005225 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
5226 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 continue;
5228 }
5229
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005230 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005231 errors,
5232 &errorHandler,
5233 "utf16", errmsg,
5234 &starts,
5235 (const char **)&e,
5236 &startinpos,
5237 &endinpos,
5238 &exc,
5239 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005240 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 }
5243
Antoine Pitrou63065d72012-05-15 23:48:04 +02005244End:
Walter Dörwald69652032004-09-07 20:24:22 +00005245 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005248 Py_XDECREF(errorHandler);
5249 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005250 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005253 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005254 Py_XDECREF(errorHandler);
5255 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 return NULL;
5257}
5258
Tim Peters772747b2001-08-09 22:21:55 +00005259PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005260_PyUnicode_EncodeUTF16(PyObject *str,
5261 const char *errors,
5262 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005264 enum PyUnicode_Kind kind;
5265 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005266 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005267 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005268 unsigned short *out;
5269 Py_ssize_t bytesize;
5270 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005271#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005272 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005273#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005274 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005275#endif
5276
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005277 if (!PyUnicode_Check(str)) {
5278 PyErr_BadArgument();
5279 return NULL;
5280 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005281 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005282 return NULL;
5283 kind = PyUnicode_KIND(str);
5284 data = PyUnicode_DATA(str);
5285 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005286
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005287 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005288 if (kind == PyUnicode_4BYTE_KIND) {
5289 const Py_UCS4 *in = (const Py_UCS4 *)data;
5290 const Py_UCS4 *end = in + len;
5291 while (in < end)
5292 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005293 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005294 }
5295 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005297 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005298 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 if (v == NULL)
5300 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005302 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005303 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005304 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005306 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005307 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005308 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005309
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005310 switch (kind) {
5311 case PyUnicode_1BYTE_KIND: {
5312 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5313 break;
Tim Peters772747b2001-08-09 22:21:55 +00005314 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005315 case PyUnicode_2BYTE_KIND: {
5316 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5317 break;
Tim Peters772747b2001-08-09 22:21:55 +00005318 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005319 case PyUnicode_4BYTE_KIND: {
5320 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5321 break;
5322 }
5323 default:
5324 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005325 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005326
5327 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005328 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329}
5330
Alexander Belopolsky40018472011-02-26 01:02:56 +00005331PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005332PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5333 Py_ssize_t size,
5334 const char *errors,
5335 int byteorder)
5336{
5337 PyObject *result;
5338 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5339 if (tmp == NULL)
5340 return NULL;
5341 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5342 Py_DECREF(tmp);
5343 return result;
5344}
5345
5346PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005347PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005349 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350}
5351
5352/* --- Unicode Escape Codec ----------------------------------------------- */
5353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005354/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5355 if all the escapes in the string make it still a valid ASCII string.
5356 Returns -1 if any escapes were found which cause the string to
5357 pop out of ASCII range. Otherwise returns the length of the
5358 required buffer to hold the string.
5359 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005360static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005361length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5362{
5363 const unsigned char *p = (const unsigned char *)s;
5364 const unsigned char *end = p + size;
5365 Py_ssize_t length = 0;
5366
5367 if (size < 0)
5368 return -1;
5369
5370 for (; p < end; ++p) {
5371 if (*p > 127) {
5372 /* Non-ASCII */
5373 return -1;
5374 }
5375 else if (*p != '\\') {
5376 /* Normal character */
5377 ++length;
5378 }
5379 else {
5380 /* Backslash-escape, check next char */
5381 ++p;
5382 /* Escape sequence reaches till end of string or
5383 non-ASCII follow-up. */
5384 if (p >= end || *p > 127)
5385 return -1;
5386 switch (*p) {
5387 case '\n':
5388 /* backslash + \n result in zero characters */
5389 break;
5390 case '\\': case '\'': case '\"':
5391 case 'b': case 'f': case 't':
5392 case 'n': case 'r': case 'v': case 'a':
5393 ++length;
5394 break;
5395 case '0': case '1': case '2': case '3':
5396 case '4': case '5': case '6': case '7':
5397 case 'x': case 'u': case 'U': case 'N':
5398 /* these do not guarantee ASCII characters */
5399 return -1;
5400 default:
5401 /* count the backslash + the other character */
5402 length += 2;
5403 }
5404 }
5405 }
5406 return length;
5407}
5408
Fredrik Lundh06d12682001-01-24 07:59:11 +00005409static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005410
Alexander Belopolsky40018472011-02-26 01:02:56 +00005411PyObject *
5412PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005413 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005414 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005417 Py_ssize_t startinpos;
5418 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005419 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005421 char* message;
5422 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005423 PyObject *errorHandler = NULL;
5424 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005425 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005426
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005427 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005428 if (len == 0)
5429 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005430
5431 /* After length_of_escaped_ascii_string() there are two alternatives,
5432 either the string is pure ASCII with named escapes like \n, etc.
5433 and we determined it's exact size (common case)
5434 or it contains \x, \u, ... escape sequences. then we create a
5435 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005436 _PyUnicodeWriter_Init(&writer, 0);
5437 if (len > 0) {
5438 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005440 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 }
5442 else {
5443 /* Escaped strings will always be longer than the resulting
5444 Unicode string, so we start with size here and then reduce the
5445 length after conversion to the true value.
5446 (but if the error callback returns a long replacement string
5447 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005448 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005450 }
5451
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005453 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005455
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 while (s < end) {
5457 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005458 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460
5461 /* Non-escape characters are interpreted as Unicode ordinals */
5462 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005463 x = (unsigned char)*s;
5464 s++;
5465 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005466 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005467 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5468 writer.pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 continue;
5470 }
5471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 /* \ - Escapes */
5474 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005475 c = *s++;
5476 if (s > end)
5477 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005479 /* The only case in which i == ascii_length is a backslash
5480 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005481 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005483 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005486#define WRITECHAR(ch) \
5487 do { \
5488 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) \
5489 goto onError; \
5490 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \
5491 writer.pos++; \
5492 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005493
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005495 case '\\': WRITECHAR('\\'); break;
5496 case '\'': WRITECHAR('\''); break;
5497 case '\"': WRITECHAR('\"'); break;
5498 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005499 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005500 case 'f': WRITECHAR('\014'); break;
5501 case 't': WRITECHAR('\t'); break;
5502 case 'n': WRITECHAR('\n'); break;
5503 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005504 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005505 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005506 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005507 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 case '0': case '1': case '2': case '3':
5511 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005512 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005513 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005514 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005515 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005516 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005518 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 break;
5520
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 /* hex escapes */
5522 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005524 digits = 2;
5525 message = "truncated \\xXX escape";
5526 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005530 digits = 4;
5531 message = "truncated \\uXXXX escape";
5532 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005535 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005536 digits = 8;
5537 message = "truncated \\UXXXXXXXX escape";
5538 hexescape:
5539 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005540 if (end - s < digits) {
5541 /* count only hex digits */
5542 for (; s < end; ++s) {
5543 c = (unsigned char)*s;
5544 if (!Py_ISXDIGIT(c))
5545 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005546 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005547 goto error;
5548 }
5549 for (; digits--; ++s) {
5550 c = (unsigned char)*s;
5551 if (!Py_ISXDIGIT(c))
5552 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005553 chr = (chr<<4) & ~0xF;
5554 if (c >= '0' && c <= '9')
5555 chr += c - '0';
5556 else if (c >= 'a' && c <= 'f')
5557 chr += 10 + c - 'a';
5558 else
5559 chr += 10 + c - 'A';
5560 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005561 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005562 /* _decoding_error will have already written into the
5563 target buffer. */
5564 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005565 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005566 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005567 message = "illegal Unicode character";
5568 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005569 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005570 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005571 break;
5572
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005574 case 'N':
5575 message = "malformed \\N character escape";
5576 if (ucnhash_CAPI == NULL) {
5577 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5579 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005580 if (ucnhash_CAPI == NULL)
5581 goto ucnhashError;
5582 }
5583 if (*s == '{') {
5584 const char *start = s+1;
5585 /* look for the closing brace */
5586 while (*s != '}' && s < end)
5587 s++;
5588 if (s > start && s < end && *s == '}') {
5589 /* found a name. look it up in the unicode database */
5590 message = "unknown Unicode character name";
5591 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005592 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005593 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005594 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005595 goto store;
5596 }
5597 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005598 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005599
5600 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005601 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 message = "\\ at end of string";
5603 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005604 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005605 }
5606 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005607 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005608 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005609 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005610 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005612 continue;
5613
5614 error:
5615 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005616 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005617 errors, &errorHandler,
5618 "unicodeescape", message,
5619 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005620 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005621 goto onError;
5622 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005624#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005625
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005626 Py_XDECREF(errorHandler);
5627 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005629
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005631 PyErr_SetString(
5632 PyExc_UnicodeError,
5633 "\\N escapes not supported (can't load unicodedata module)"
5634 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 Py_XDECREF(errorHandler);
5637 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005638 return NULL;
5639
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642 Py_XDECREF(errorHandler);
5643 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 return NULL;
5645}
5646
5647/* Return a Unicode-Escape string version of the Unicode object.
5648
5649 If quotes is true, the string is enclosed in u"" or u'' quotes as
5650 appropriate.
5651
5652*/
5653
Alexander Belopolsky40018472011-02-26 01:02:56 +00005654PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005655PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005658 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660 int kind;
5661 void *data;
5662 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663
Ezio Melottie7f90372012-10-05 03:33:31 +03005664 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005665 escape.
5666
Ezio Melottie7f90372012-10-05 03:33:31 +03005667 For UCS1 strings it's '\xxx', 4 bytes per source character.
5668 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5669 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005670 */
5671
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005672 if (!PyUnicode_Check(unicode)) {
5673 PyErr_BadArgument();
5674 return NULL;
5675 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005676 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005677 return NULL;
5678 len = PyUnicode_GET_LENGTH(unicode);
5679 kind = PyUnicode_KIND(unicode);
5680 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005681 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005682 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5683 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5684 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5685 }
5686
5687 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005688 return PyBytes_FromStringAndSize(NULL, 0);
5689
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005690 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005692
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005693 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005695 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 if (repr == NULL)
5698 return NULL;
5699
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005700 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005702 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005703 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005704
Walter Dörwald79e913e2007-05-12 11:08:06 +00005705 /* Escape backslashes */
5706 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 *p++ = '\\';
5708 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005709 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005710 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005711
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005712 /* Map 21-bit characters to '\U00xxxxxx' */
5713 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005714 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005715 *p++ = '\\';
5716 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005717 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5718 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5719 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5720 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5721 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5722 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5723 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5724 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005726 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005727
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005729 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 *p++ = '\\';
5731 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005732 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5733 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5734 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5735 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005737
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005738 /* Map special whitespace to '\t', \n', '\r' */
5739 else if (ch == '\t') {
5740 *p++ = '\\';
5741 *p++ = 't';
5742 }
5743 else if (ch == '\n') {
5744 *p++ = '\\';
5745 *p++ = 'n';
5746 }
5747 else if (ch == '\r') {
5748 *p++ = '\\';
5749 *p++ = 'r';
5750 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005751
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005752 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005753 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005755 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005756 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5757 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005758 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005759
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 /* Copy everything else as-is */
5761 else
5762 *p++ = (char) ch;
5763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005765 assert(p - PyBytes_AS_STRING(repr) > 0);
5766 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5767 return NULL;
5768 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769}
5770
Alexander Belopolsky40018472011-02-26 01:02:56 +00005771PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005772PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5773 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005775 PyObject *result;
5776 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5777 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005779 result = PyUnicode_AsUnicodeEscapeString(tmp);
5780 Py_DECREF(tmp);
5781 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782}
5783
5784/* --- Raw Unicode Escape Codec ------------------------------------------- */
5785
Alexander Belopolsky40018472011-02-26 01:02:56 +00005786PyObject *
5787PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005788 Py_ssize_t size,
5789 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792 Py_ssize_t startinpos;
5793 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005794 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 const char *end;
5796 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 PyObject *errorHandler = NULL;
5798 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005799
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005800 if (size == 0)
5801 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 /* Escaped strings will always be longer than the resulting
5804 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 length after conversion to the true value. (But decoding error
5806 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005807 _PyUnicodeWriter_Init(&writer, 1);
5808 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005810
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 end = s + size;
5812 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 unsigned char c;
5814 Py_UCS4 x;
5815 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005816 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 /* Non-escape characters are interpreted as Unicode ordinals */
5819 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005820 x = (unsigned char)*s++;
5821 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005822 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005823 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5824 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005826 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 startinpos = s-starts;
5828
5829 /* \u-escapes are only interpreted iff the number of leading
5830 backslashes if odd */
5831 bs = s;
5832 for (;s < end;) {
5833 if (*s != '\\')
5834 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005835 x = (unsigned char)*s++;
5836 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005837 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005838 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5839 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 }
5841 if (((s - bs) & 1) == 0 ||
5842 s >= end ||
5843 (*s != 'u' && *s != 'U')) {
5844 continue;
5845 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005846 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 count = *s=='u' ? 4 : 8;
5848 s++;
5849
5850 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 for (x = 0, i = 0; i < count; ++i, ++s) {
5852 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005853 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 errors, &errorHandler,
5857 "rawunicodeescape", "truncated \\uXXXX",
5858 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005859 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 goto onError;
5861 goto nextByte;
5862 }
5863 x = (x<<4) & ~0xF;
5864 if (c >= '0' && c <= '9')
5865 x += c - '0';
5866 else if (c >= 'a' && c <= 'f')
5867 x += 10 + c - 'a';
5868 else
5869 x += 10 + c - 'A';
5870 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005871 if (x <= MAX_UNICODE) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005872 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005873 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005874 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5875 writer.pos++;
5876 }
5877 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005878 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005880 errors, &errorHandler,
5881 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005883 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005885 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 nextByte:
5887 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 Py_XDECREF(errorHandler);
5890 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005891 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005892
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 return NULL;
5898}
5899
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900
Alexander Belopolsky40018472011-02-26 01:02:56 +00005901PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005904 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 char *p;
5906 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 Py_ssize_t expandsize, pos;
5908 int kind;
5909 void *data;
5910 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005912 if (!PyUnicode_Check(unicode)) {
5913 PyErr_BadArgument();
5914 return NULL;
5915 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005916 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005917 return NULL;
5918 kind = PyUnicode_KIND(unicode);
5919 data = PyUnicode_DATA(unicode);
5920 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005921 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5922 bytes, and 1 byte characters 4. */
5923 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005924
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005925 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005927
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 if (repr == NULL)
5930 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005932 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005934 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005935 for (pos = 0; pos < len; pos++) {
5936 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 /* Map 32-bit characters to '\Uxxxxxxxx' */
5938 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005939 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005940 *p++ = '\\';
5941 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005942 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5943 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5944 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5945 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5946 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5947 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5948 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5949 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005950 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 *p++ = '\\';
5954 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005955 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5956 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5957 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5958 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 /* Copy everything else as-is */
5961 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 *p++ = (char) ch;
5963 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005964
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005965 assert(p > q);
5966 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005967 return NULL;
5968 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969}
5970
Alexander Belopolsky40018472011-02-26 01:02:56 +00005971PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5973 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005975 PyObject *result;
5976 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5977 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005978 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005979 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5980 Py_DECREF(tmp);
5981 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982}
5983
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005984/* --- Unicode Internal Codec ------------------------------------------- */
5985
Alexander Belopolsky40018472011-02-26 01:02:56 +00005986PyObject *
5987_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005988 Py_ssize_t size,
5989 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005990{
5991 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005992 Py_ssize_t startinpos;
5993 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005994 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005995 const char *end;
5996 const char *reason;
5997 PyObject *errorHandler = NULL;
5998 PyObject *exc = NULL;
5999
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006000 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006001 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006002 1))
6003 return NULL;
6004
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006005 if (size == 0)
6006 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006007
Thomas Wouters89f507f2006-12-13 04:49:30 +00006008 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006009 _PyUnicodeWriter_Init(&writer, 0);
6010 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006012 end = s + size;
6013
6014 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006015 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006016 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006017 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006018 endinpos = end-starts;
6019 reason = "truncated input";
6020 goto error;
6021 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006022 /* We copy the raw representation one byte at a time because the
6023 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006024 ((char *) &uch)[0] = s[0];
6025 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006026#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006027 ((char *) &uch)[2] = s[2];
6028 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006029#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006030 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006031#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006032 /* We have to sanity check the raw data, otherwise doom looms for
6033 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006034 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006035 endinpos = s - starts + Py_UNICODE_SIZE;
6036 reason = "illegal code point (> 0x10FFFF)";
6037 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006038 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006039#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006040 s += Py_UNICODE_SIZE;
6041#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006042 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006043 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006044 Py_UNICODE uch2;
6045 ((char *) &uch2)[0] = s[0];
6046 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006047 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006048 {
Victor Stinner551ac952011-11-29 22:58:13 +01006049 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006050 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006051 }
6052 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006053#endif
6054
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006055 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006056 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006057 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
6058 writer.pos++;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006059 continue;
6060
6061 error:
6062 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006063 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006064 errors, &errorHandler,
6065 "unicode_internal", reason,
6066 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006067 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006068 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006069 }
6070
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006071 Py_XDECREF(errorHandler);
6072 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006073 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006074
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006076 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006077 Py_XDECREF(errorHandler);
6078 Py_XDECREF(exc);
6079 return NULL;
6080}
6081
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082/* --- Latin-1 Codec ------------------------------------------------------ */
6083
Alexander Belopolsky40018472011-02-26 01:02:56 +00006084PyObject *
6085PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006086 Py_ssize_t size,
6087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006090 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091}
6092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006094static void
6095make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006096 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006097 PyObject *unicode,
6098 Py_ssize_t startpos, Py_ssize_t endpos,
6099 const char *reason)
6100{
6101 if (*exceptionObject == NULL) {
6102 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006103 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006104 encoding, unicode, startpos, endpos, reason);
6105 }
6106 else {
6107 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6108 goto onError;
6109 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6110 goto onError;
6111 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6112 goto onError;
6113 return;
6114 onError:
6115 Py_DECREF(*exceptionObject);
6116 *exceptionObject = NULL;
6117 }
6118}
6119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006121static void
6122raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006123 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006124 PyObject *unicode,
6125 Py_ssize_t startpos, Py_ssize_t endpos,
6126 const char *reason)
6127{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006128 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006129 encoding, unicode, startpos, endpos, reason);
6130 if (*exceptionObject != NULL)
6131 PyCodec_StrictErrors(*exceptionObject);
6132}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133
6134/* error handling callback helper:
6135 build arguments, call the callback and check the arguments,
6136 put the result into newpos and return the replacement string, which
6137 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006138static PyObject *
6139unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006140 PyObject **errorHandler,
6141 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006142 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006143 Py_ssize_t startpos, Py_ssize_t endpos,
6144 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006146 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006147 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 PyObject *restuple;
6149 PyObject *resunicode;
6150
6151 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006155 }
6156
Benjamin Petersonbac79492012-01-14 13:34:47 -05006157 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006158 return NULL;
6159 len = PyUnicode_GET_LENGTH(unicode);
6160
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006161 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006162 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165
6166 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006168 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006171 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 Py_DECREF(restuple);
6173 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006175 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 &resunicode, newpos)) {
6177 Py_DECREF(restuple);
6178 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006180 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6181 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6182 Py_DECREF(restuple);
6183 return NULL;
6184 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006186 *newpos = len + *newpos;
6187 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6189 Py_DECREF(restuple);
6190 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006191 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 Py_INCREF(resunicode);
6193 Py_DECREF(restuple);
6194 return resunicode;
6195}
6196
Alexander Belopolsky40018472011-02-26 01:02:56 +00006197static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006198unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006199 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006200 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006202 /* input state */
6203 Py_ssize_t pos=0, size;
6204 int kind;
6205 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006206 /* output object */
6207 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 /* pointer into the output */
6209 char *str;
6210 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006211 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006212 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6213 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 PyObject *errorHandler = NULL;
6215 PyObject *exc = NULL;
6216 /* the following variable is used for caching string comparisons
6217 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6218 int known_errorHandler = -1;
6219
Benjamin Petersonbac79492012-01-14 13:34:47 -05006220 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006221 return NULL;
6222 size = PyUnicode_GET_LENGTH(unicode);
6223 kind = PyUnicode_KIND(unicode);
6224 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006225 /* allocate enough for a simple encoding without
6226 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006227 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006228 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006229 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006230 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006231 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006232 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006233 ressize = size;
6234
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006235 while (pos < size) {
6236 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006237
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 /* can we encode this? */
6239 if (c<limit) {
6240 /* no overflow check, because we know that the space is enough */
6241 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006242 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006243 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 Py_ssize_t requiredsize;
6246 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006247 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006249 Py_ssize_t collstart = pos;
6250 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006252 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 ++collend;
6254 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6255 if (known_errorHandler==-1) {
6256 if ((errors==NULL) || (!strcmp(errors, "strict")))
6257 known_errorHandler = 1;
6258 else if (!strcmp(errors, "replace"))
6259 known_errorHandler = 2;
6260 else if (!strcmp(errors, "ignore"))
6261 known_errorHandler = 3;
6262 else if (!strcmp(errors, "xmlcharrefreplace"))
6263 known_errorHandler = 4;
6264 else
6265 known_errorHandler = 0;
6266 }
6267 switch (known_errorHandler) {
6268 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006269 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 goto onError;
6271 case 2: /* replace */
6272 while (collstart++<collend)
6273 *str++ = '?'; /* fall through */
6274 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 break;
6277 case 4: /* xmlcharrefreplace */
6278 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006279 /* determine replacement size */
6280 for (i = collstart, repsize = 0; i < collend; ++i) {
6281 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6282 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006284 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006286 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006288 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006290 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006292 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006294 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006295 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006297 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006299 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 if (requiredsize > ressize) {
6301 if (requiredsize<2*ressize)
6302 requiredsize = 2*ressize;
6303 if (_PyBytes_Resize(&res, requiredsize))
6304 goto onError;
6305 str = PyBytes_AS_STRING(res) + respos;
6306 ressize = requiredsize;
6307 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006308 /* generate replacement */
6309 for (i = collstart; i < collend; ++i) {
6310 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006312 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 break;
6314 default:
6315 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006316 encoding, reason, unicode, &exc,
6317 collstart, collend, &newpos);
6318 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006319 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006321 if (PyBytes_Check(repunicode)) {
6322 /* Directly copy bytes result to output. */
6323 repsize = PyBytes_Size(repunicode);
6324 if (repsize > 1) {
6325 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006326 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006327 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6328 Py_DECREF(repunicode);
6329 goto onError;
6330 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006331 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006332 ressize += repsize-1;
6333 }
6334 memcpy(str, PyBytes_AsString(repunicode), repsize);
6335 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006336 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006337 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006338 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006339 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 /* need more space? (at least enough for what we
6341 have+the replacement+the rest of the string, so
6342 we won't have to check space for encodable characters) */
6343 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006344 repsize = PyUnicode_GET_LENGTH(repunicode);
6345 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 if (requiredsize > ressize) {
6347 if (requiredsize<2*ressize)
6348 requiredsize = 2*ressize;
6349 if (_PyBytes_Resize(&res, requiredsize)) {
6350 Py_DECREF(repunicode);
6351 goto onError;
6352 }
6353 str = PyBytes_AS_STRING(res) + respos;
6354 ressize = requiredsize;
6355 }
6356 /* check if there is anything unencodable in the replacement
6357 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358 for (i = 0; repsize-->0; ++i, ++str) {
6359 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006361 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006362 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 Py_DECREF(repunicode);
6364 goto onError;
6365 }
6366 *str = (char)c;
6367 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006369 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006370 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006371 }
6372 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006373 /* Resize if we allocated to much */
6374 size = str - PyBytes_AS_STRING(res);
6375 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006376 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006377 if (_PyBytes_Resize(&res, size) < 0)
6378 goto onError;
6379 }
6380
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381 Py_XDECREF(errorHandler);
6382 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006383 return res;
6384
6385 onError:
6386 Py_XDECREF(res);
6387 Py_XDECREF(errorHandler);
6388 Py_XDECREF(exc);
6389 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390}
6391
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006393PyObject *
6394PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006395 Py_ssize_t size,
6396 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 PyObject *result;
6399 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6400 if (unicode == NULL)
6401 return NULL;
6402 result = unicode_encode_ucs1(unicode, errors, 256);
6403 Py_DECREF(unicode);
6404 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
Alexander Belopolsky40018472011-02-26 01:02:56 +00006407PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006408_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409{
6410 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 PyErr_BadArgument();
6412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006414 if (PyUnicode_READY(unicode) == -1)
6415 return NULL;
6416 /* Fast path: if it is a one-byte string, construct
6417 bytes object directly. */
6418 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6419 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6420 PyUnicode_GET_LENGTH(unicode));
6421 /* Non-Latin-1 characters present. Defer to above function to
6422 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006423 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006424}
6425
6426PyObject*
6427PyUnicode_AsLatin1String(PyObject *unicode)
6428{
6429 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430}
6431
6432/* --- 7-bit ASCII Codec -------------------------------------------------- */
6433
Alexander Belopolsky40018472011-02-26 01:02:56 +00006434PyObject *
6435PyUnicode_DecodeASCII(const char *s,
6436 Py_ssize_t size,
6437 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006440 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006441 int kind;
6442 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006443 Py_ssize_t startinpos;
6444 Py_ssize_t endinpos;
6445 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 const char *e;
6447 PyObject *errorHandler = NULL;
6448 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006449
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006451 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006454 if (size == 1 && (unsigned char)s[0] < 128)
6455 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006456
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006457 _PyUnicodeWriter_Init(&writer, 0);
6458 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006462 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006463 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006464 writer.pos = outpos;
6465 if (writer.pos == size)
6466 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006467
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006468 s += writer.pos;
6469 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 register unsigned char c = (unsigned char)*s;
6472 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006473 PyUnicode_WRITE(kind, data, writer.pos, c);
6474 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 ++s;
6476 }
6477 else {
6478 startinpos = s-starts;
6479 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006480 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 errors, &errorHandler,
6482 "ascii", "ordinal not in range(128)",
6483 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006484 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006486 kind = writer.kind;
6487 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490 Py_XDECREF(errorHandler);
6491 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006492 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006493
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006495 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496 Py_XDECREF(errorHandler);
6497 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 return NULL;
6499}
6500
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502PyObject *
6503PyUnicode_EncodeASCII(const Py_UNICODE *p,
6504 Py_ssize_t size,
6505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 PyObject *result;
6508 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6509 if (unicode == NULL)
6510 return NULL;
6511 result = unicode_encode_ucs1(unicode, errors, 128);
6512 Py_DECREF(unicode);
6513 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514}
6515
Alexander Belopolsky40018472011-02-26 01:02:56 +00006516PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006517_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518{
6519 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 PyErr_BadArgument();
6521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006523 if (PyUnicode_READY(unicode) == -1)
6524 return NULL;
6525 /* Fast path: if it is an ASCII-only string, construct bytes object
6526 directly. Else defer to above function to raise the exception. */
6527 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6528 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6529 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006531}
6532
6533PyObject *
6534PyUnicode_AsASCIIString(PyObject *unicode)
6535{
6536 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537}
6538
Victor Stinner99b95382011-07-04 14:23:54 +02006539#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006540
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006541/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006542
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006543#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006544#define NEED_RETRY
6545#endif
6546
Victor Stinner3a50e702011-10-18 21:21:00 +02006547#ifndef WC_ERR_INVALID_CHARS
6548# define WC_ERR_INVALID_CHARS 0x0080
6549#endif
6550
6551static char*
6552code_page_name(UINT code_page, PyObject **obj)
6553{
6554 *obj = NULL;
6555 if (code_page == CP_ACP)
6556 return "mbcs";
6557 if (code_page == CP_UTF7)
6558 return "CP_UTF7";
6559 if (code_page == CP_UTF8)
6560 return "CP_UTF8";
6561
6562 *obj = PyBytes_FromFormat("cp%u", code_page);
6563 if (*obj == NULL)
6564 return NULL;
6565 return PyBytes_AS_STRING(*obj);
6566}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006567
Alexander Belopolsky40018472011-02-26 01:02:56 +00006568static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006569is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006570{
6571 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006572 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573
Victor Stinner3a50e702011-10-18 21:21:00 +02006574 if (!IsDBCSLeadByteEx(code_page, *curr))
6575 return 0;
6576
6577 prev = CharPrevExA(code_page, s, curr, 0);
6578 if (prev == curr)
6579 return 1;
6580 /* FIXME: This code is limited to "true" double-byte encodings,
6581 as it assumes an incomplete character consists of a single
6582 byte. */
6583 if (curr - prev == 2)
6584 return 1;
6585 if (!IsDBCSLeadByteEx(code_page, *prev))
6586 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006587 return 0;
6588}
6589
Victor Stinner3a50e702011-10-18 21:21:00 +02006590static DWORD
6591decode_code_page_flags(UINT code_page)
6592{
6593 if (code_page == CP_UTF7) {
6594 /* The CP_UTF7 decoder only supports flags=0 */
6595 return 0;
6596 }
6597 else
6598 return MB_ERR_INVALID_CHARS;
6599}
6600
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006601/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006602 * Decode a byte string from a Windows code page into unicode object in strict
6603 * mode.
6604 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006605 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6606 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006607 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006608static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006609decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006610 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006611 const char *in,
6612 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006613{
Victor Stinner3a50e702011-10-18 21:21:00 +02006614 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006615 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006616 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617
6618 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006619 assert(insize > 0);
6620 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6621 if (outsize <= 0)
6622 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006623
6624 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006626 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006627 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 if (*v == NULL)
6629 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006630 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006631 }
6632 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006634 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006635 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006637 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006638 }
6639
6640 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006641 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6642 if (outsize <= 0)
6643 goto error;
6644 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006645
Victor Stinner3a50e702011-10-18 21:21:00 +02006646error:
6647 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6648 return -2;
6649 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006650 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006651}
6652
Victor Stinner3a50e702011-10-18 21:21:00 +02006653/*
6654 * Decode a byte string from a code page into unicode object with an error
6655 * handler.
6656 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006657 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006658 * UnicodeDecodeError exception and returns -1 on error.
6659 */
6660static int
6661decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006662 PyObject **v,
6663 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006664 const char *errors)
6665{
6666 const char *startin = in;
6667 const char *endin = in + size;
6668 const DWORD flags = decode_code_page_flags(code_page);
6669 /* Ideally, we should get reason from FormatMessage. This is the Windows
6670 2000 English version of the message. */
6671 const char *reason = "No mapping for the Unicode character exists "
6672 "in the target code page.";
6673 /* each step cannot decode more than 1 character, but a character can be
6674 represented as a surrogate pair */
6675 wchar_t buffer[2], *startout, *out;
6676 int insize, outsize;
6677 PyObject *errorHandler = NULL;
6678 PyObject *exc = NULL;
6679 PyObject *encoding_obj = NULL;
6680 char *encoding;
6681 DWORD err;
6682 int ret = -1;
6683
6684 assert(size > 0);
6685
6686 encoding = code_page_name(code_page, &encoding_obj);
6687 if (encoding == NULL)
6688 return -1;
6689
6690 if (errors == NULL || strcmp(errors, "strict") == 0) {
6691 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6692 UnicodeDecodeError. */
6693 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6694 if (exc != NULL) {
6695 PyCodec_StrictErrors(exc);
6696 Py_CLEAR(exc);
6697 }
6698 goto error;
6699 }
6700
6701 if (*v == NULL) {
6702 /* Create unicode object */
6703 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6704 PyErr_NoMemory();
6705 goto error;
6706 }
Victor Stinnerab595942011-12-17 04:59:06 +01006707 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006708 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006709 if (*v == NULL)
6710 goto error;
6711 startout = PyUnicode_AS_UNICODE(*v);
6712 }
6713 else {
6714 /* Extend unicode object */
6715 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6716 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6717 PyErr_NoMemory();
6718 goto error;
6719 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006720 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006721 goto error;
6722 startout = PyUnicode_AS_UNICODE(*v) + n;
6723 }
6724
6725 /* Decode the byte string character per character */
6726 out = startout;
6727 while (in < endin)
6728 {
6729 /* Decode a character */
6730 insize = 1;
6731 do
6732 {
6733 outsize = MultiByteToWideChar(code_page, flags,
6734 in, insize,
6735 buffer, Py_ARRAY_LENGTH(buffer));
6736 if (outsize > 0)
6737 break;
6738 err = GetLastError();
6739 if (err != ERROR_NO_UNICODE_TRANSLATION
6740 && err != ERROR_INSUFFICIENT_BUFFER)
6741 {
6742 PyErr_SetFromWindowsErr(0);
6743 goto error;
6744 }
6745 insize++;
6746 }
6747 /* 4=maximum length of a UTF-8 sequence */
6748 while (insize <= 4 && (in + insize) <= endin);
6749
6750 if (outsize <= 0) {
6751 Py_ssize_t startinpos, endinpos, outpos;
6752
6753 startinpos = in - startin;
6754 endinpos = startinpos + 1;
6755 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006756 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006757 errors, &errorHandler,
6758 encoding, reason,
6759 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006760 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006761 {
6762 goto error;
6763 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006764 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006765 }
6766 else {
6767 in += insize;
6768 memcpy(out, buffer, outsize * sizeof(wchar_t));
6769 out += outsize;
6770 }
6771 }
6772
6773 /* write a NUL character at the end */
6774 *out = 0;
6775
6776 /* Extend unicode object */
6777 outsize = out - startout;
6778 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006779 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006780 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006781 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006782
6783error:
6784 Py_XDECREF(encoding_obj);
6785 Py_XDECREF(errorHandler);
6786 Py_XDECREF(exc);
6787 return ret;
6788}
6789
Victor Stinner3a50e702011-10-18 21:21:00 +02006790static PyObject *
6791decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006792 const char *s, Py_ssize_t size,
6793 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006794{
Victor Stinner76a31a62011-11-04 00:05:13 +01006795 PyObject *v = NULL;
6796 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006797
Victor Stinner3a50e702011-10-18 21:21:00 +02006798 if (code_page < 0) {
6799 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6800 return NULL;
6801 }
6802
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006805
Victor Stinner76a31a62011-11-04 00:05:13 +01006806 do
6807 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006808#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006809 if (size > INT_MAX) {
6810 chunk_size = INT_MAX;
6811 final = 0;
6812 done = 0;
6813 }
6814 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006815#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006816 {
6817 chunk_size = (int)size;
6818 final = (consumed == NULL);
6819 done = 1;
6820 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006821
Victor Stinner76a31a62011-11-04 00:05:13 +01006822 /* Skip trailing lead-byte unless 'final' is set */
6823 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6824 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825
Victor Stinner76a31a62011-11-04 00:05:13 +01006826 if (chunk_size == 0 && done) {
6827 if (v != NULL)
6828 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006829 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006830 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831
Victor Stinner76a31a62011-11-04 00:05:13 +01006832
6833 converted = decode_code_page_strict(code_page, &v,
6834 s, chunk_size);
6835 if (converted == -2)
6836 converted = decode_code_page_errors(code_page, &v,
6837 s, chunk_size,
6838 errors);
6839 assert(converted != 0);
6840
6841 if (converted < 0) {
6842 Py_XDECREF(v);
6843 return NULL;
6844 }
6845
6846 if (consumed)
6847 *consumed += converted;
6848
6849 s += converted;
6850 size -= converted;
6851 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006852
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006853 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006854}
6855
Alexander Belopolsky40018472011-02-26 01:02:56 +00006856PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006857PyUnicode_DecodeCodePageStateful(int code_page,
6858 const char *s,
6859 Py_ssize_t size,
6860 const char *errors,
6861 Py_ssize_t *consumed)
6862{
6863 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6864}
6865
6866PyObject *
6867PyUnicode_DecodeMBCSStateful(const char *s,
6868 Py_ssize_t size,
6869 const char *errors,
6870 Py_ssize_t *consumed)
6871{
6872 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6873}
6874
6875PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006876PyUnicode_DecodeMBCS(const char *s,
6877 Py_ssize_t size,
6878 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006879{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6881}
6882
Victor Stinner3a50e702011-10-18 21:21:00 +02006883static DWORD
6884encode_code_page_flags(UINT code_page, const char *errors)
6885{
6886 if (code_page == CP_UTF8) {
6887 if (winver.dwMajorVersion >= 6)
6888 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6889 and later */
6890 return WC_ERR_INVALID_CHARS;
6891 else
6892 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6893 return 0;
6894 }
6895 else if (code_page == CP_UTF7) {
6896 /* CP_UTF7 only supports flags=0 */
6897 return 0;
6898 }
6899 else {
6900 if (errors != NULL && strcmp(errors, "replace") == 0)
6901 return 0;
6902 else
6903 return WC_NO_BEST_FIT_CHARS;
6904 }
6905}
6906
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 * Encode a Unicode string to a Windows code page into a byte string in strict
6909 * mode.
6910 *
6911 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006912 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006913 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006914static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006915encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006916 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918{
Victor Stinner554f3f02010-06-16 23:33:54 +00006919 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006920 BOOL *pusedDefaultChar = &usedDefaultChar;
6921 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006922 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006923 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006924 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006925 const DWORD flags = encode_code_page_flags(code_page, NULL);
6926 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006927 /* Create a substring so that we can get the UTF-16 representation
6928 of just the slice under consideration. */
6929 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006930
Martin v. Löwis3d325192011-11-04 18:23:06 +01006931 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006932
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006934 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006935 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006936 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006937
Victor Stinner2fc507f2011-11-04 20:06:39 +01006938 substring = PyUnicode_Substring(unicode, offset, offset+len);
6939 if (substring == NULL)
6940 return -1;
6941 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6942 if (p == NULL) {
6943 Py_DECREF(substring);
6944 return -1;
6945 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006946
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006947 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 outsize = WideCharToMultiByte(code_page, flags,
6949 p, size,
6950 NULL, 0,
6951 NULL, pusedDefaultChar);
6952 if (outsize <= 0)
6953 goto error;
6954 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006955 if (pusedDefaultChar && *pusedDefaultChar) {
6956 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006958 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006959
Victor Stinner3a50e702011-10-18 21:21:00 +02006960 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006962 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006963 if (*outbytes == NULL) {
6964 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006966 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006967 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968 }
6969 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 const Py_ssize_t n = PyBytes_Size(*outbytes);
6972 if (outsize > PY_SSIZE_T_MAX - n) {
6973 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006974 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006976 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006977 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6978 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006980 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982 }
6983
6984 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006985 outsize = WideCharToMultiByte(code_page, flags,
6986 p, size,
6987 out, outsize,
6988 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006989 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006990 if (outsize <= 0)
6991 goto error;
6992 if (pusedDefaultChar && *pusedDefaultChar)
6993 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006995
Victor Stinner3a50e702011-10-18 21:21:00 +02006996error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006997 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006998 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6999 return -2;
7000 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007001 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007002}
7003
Victor Stinner3a50e702011-10-18 21:21:00 +02007004/*
7005 * Encode a Unicode string to a Windows code page into a byte string using a
7006 * error handler.
7007 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007008 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 * -1 on other error.
7010 */
7011static int
7012encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007013 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007014 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007015{
Victor Stinner3a50e702011-10-18 21:21:00 +02007016 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007017 Py_ssize_t pos = unicode_offset;
7018 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007019 /* Ideally, we should get reason from FormatMessage. This is the Windows
7020 2000 English version of the message. */
7021 const char *reason = "invalid character";
7022 /* 4=maximum length of a UTF-8 sequence */
7023 char buffer[4];
7024 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7025 Py_ssize_t outsize;
7026 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007027 PyObject *errorHandler = NULL;
7028 PyObject *exc = NULL;
7029 PyObject *encoding_obj = NULL;
7030 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007031 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007032 PyObject *rep;
7033 int ret = -1;
7034
7035 assert(insize > 0);
7036
7037 encoding = code_page_name(code_page, &encoding_obj);
7038 if (encoding == NULL)
7039 return -1;
7040
7041 if (errors == NULL || strcmp(errors, "strict") == 0) {
7042 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7043 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007044 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007045 if (exc != NULL) {
7046 PyCodec_StrictErrors(exc);
7047 Py_DECREF(exc);
7048 }
7049 Py_XDECREF(encoding_obj);
7050 return -1;
7051 }
7052
7053 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7054 pusedDefaultChar = &usedDefaultChar;
7055 else
7056 pusedDefaultChar = NULL;
7057
7058 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7059 PyErr_NoMemory();
7060 goto error;
7061 }
7062 outsize = insize * Py_ARRAY_LENGTH(buffer);
7063
7064 if (*outbytes == NULL) {
7065 /* Create string object */
7066 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7067 if (*outbytes == NULL)
7068 goto error;
7069 out = PyBytes_AS_STRING(*outbytes);
7070 }
7071 else {
7072 /* Extend string object */
7073 Py_ssize_t n = PyBytes_Size(*outbytes);
7074 if (n > PY_SSIZE_T_MAX - outsize) {
7075 PyErr_NoMemory();
7076 goto error;
7077 }
7078 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7079 goto error;
7080 out = PyBytes_AS_STRING(*outbytes) + n;
7081 }
7082
7083 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007084 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007085 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007086 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7087 wchar_t chars[2];
7088 int charsize;
7089 if (ch < 0x10000) {
7090 chars[0] = (wchar_t)ch;
7091 charsize = 1;
7092 }
7093 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007094 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7095 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007096 charsize = 2;
7097 }
7098
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007100 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 buffer, Py_ARRAY_LENGTH(buffer),
7102 NULL, pusedDefaultChar);
7103 if (outsize > 0) {
7104 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7105 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007106 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 memcpy(out, buffer, outsize);
7108 out += outsize;
7109 continue;
7110 }
7111 }
7112 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7113 PyErr_SetFromWindowsErr(0);
7114 goto error;
7115 }
7116
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 rep = unicode_encode_call_errorhandler(
7118 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007119 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007120 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 if (rep == NULL)
7122 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007123 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007124
7125 if (PyBytes_Check(rep)) {
7126 outsize = PyBytes_GET_SIZE(rep);
7127 if (outsize != 1) {
7128 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7129 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7130 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7131 Py_DECREF(rep);
7132 goto error;
7133 }
7134 out = PyBytes_AS_STRING(*outbytes) + offset;
7135 }
7136 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7137 out += outsize;
7138 }
7139 else {
7140 Py_ssize_t i;
7141 enum PyUnicode_Kind kind;
7142 void *data;
7143
Benjamin Petersonbac79492012-01-14 13:34:47 -05007144 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 Py_DECREF(rep);
7146 goto error;
7147 }
7148
7149 outsize = PyUnicode_GET_LENGTH(rep);
7150 if (outsize != 1) {
7151 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7152 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7153 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7154 Py_DECREF(rep);
7155 goto error;
7156 }
7157 out = PyBytes_AS_STRING(*outbytes) + offset;
7158 }
7159 kind = PyUnicode_KIND(rep);
7160 data = PyUnicode_DATA(rep);
7161 for (i=0; i < outsize; i++) {
7162 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7163 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007164 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007165 encoding, unicode,
7166 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 "unable to encode error handler result to ASCII");
7168 Py_DECREF(rep);
7169 goto error;
7170 }
7171 *out = (unsigned char)ch;
7172 out++;
7173 }
7174 }
7175 Py_DECREF(rep);
7176 }
7177 /* write a NUL byte */
7178 *out = 0;
7179 outsize = out - PyBytes_AS_STRING(*outbytes);
7180 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7181 if (_PyBytes_Resize(outbytes, outsize) < 0)
7182 goto error;
7183 ret = 0;
7184
7185error:
7186 Py_XDECREF(encoding_obj);
7187 Py_XDECREF(errorHandler);
7188 Py_XDECREF(exc);
7189 return ret;
7190}
7191
Victor Stinner3a50e702011-10-18 21:21:00 +02007192static PyObject *
7193encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007194 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 const char *errors)
7196{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007197 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007199 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007200 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007201
Benjamin Petersonbac79492012-01-14 13:34:47 -05007202 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007203 return NULL;
7204 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007205
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 if (code_page < 0) {
7207 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7208 return NULL;
7209 }
7210
Martin v. Löwis3d325192011-11-04 18:23:06 +01007211 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007212 return PyBytes_FromStringAndSize(NULL, 0);
7213
Victor Stinner7581cef2011-11-03 22:32:33 +01007214 offset = 0;
7215 do
7216 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007218 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007219 chunks. */
7220 if (len > INT_MAX/2) {
7221 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007222 done = 0;
7223 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007224 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007225#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007226 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007227 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007228 done = 1;
7229 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230
Victor Stinner76a31a62011-11-04 00:05:13 +01007231 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007232 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007233 errors);
7234 if (ret == -2)
7235 ret = encode_code_page_errors(code_page, &outbytes,
7236 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007237 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007238 if (ret < 0) {
7239 Py_XDECREF(outbytes);
7240 return NULL;
7241 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007242
Victor Stinner7581cef2011-11-03 22:32:33 +01007243 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007244 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007245 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007246
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 return outbytes;
7248}
7249
7250PyObject *
7251PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7252 Py_ssize_t size,
7253 const char *errors)
7254{
Victor Stinner7581cef2011-11-03 22:32:33 +01007255 PyObject *unicode, *res;
7256 unicode = PyUnicode_FromUnicode(p, size);
7257 if (unicode == NULL)
7258 return NULL;
7259 res = encode_code_page(CP_ACP, unicode, errors);
7260 Py_DECREF(unicode);
7261 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007262}
7263
7264PyObject *
7265PyUnicode_EncodeCodePage(int code_page,
7266 PyObject *unicode,
7267 const char *errors)
7268{
Victor Stinner7581cef2011-11-03 22:32:33 +01007269 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007270}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007271
Alexander Belopolsky40018472011-02-26 01:02:56 +00007272PyObject *
7273PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007274{
7275 if (!PyUnicode_Check(unicode)) {
7276 PyErr_BadArgument();
7277 return NULL;
7278 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007279 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007280}
7281
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007282#undef NEED_RETRY
7283
Victor Stinner99b95382011-07-04 14:23:54 +02007284#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007285
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286/* --- Character Mapping Codec -------------------------------------------- */
7287
Alexander Belopolsky40018472011-02-26 01:02:56 +00007288PyObject *
7289PyUnicode_DecodeCharmap(const char *s,
7290 Py_ssize_t size,
7291 PyObject *mapping,
7292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007294 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007295 Py_ssize_t startinpos;
7296 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007297 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007298 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299 PyObject *errorHandler = NULL;
7300 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007301
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 /* Default to Latin-1 */
7303 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007307 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007308 _PyUnicodeWriter_Init(&writer, 0);
7309 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007313 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007314 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007315 enum PyUnicode_Kind mapkind;
7316 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007317 Py_UCS4 x;
Victor Stinner03c3e352013-04-09 21:53:09 +02007318 unsigned char ch;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007319
Benjamin Petersonbac79492012-01-14 13:34:47 -05007320 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007321 return NULL;
7322
7323 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007324 mapdata = PyUnicode_DATA(mapping);
7325 mapkind = PyUnicode_KIND(mapping);
Victor Stinner03c3e352013-04-09 21:53:09 +02007326
7327 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7328 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7329 * is disabled in encoding aliases, latin1 is preferred because
7330 * its implementation is faster. */
7331 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7332 Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
7333 Py_UCS4 maxchar = writer.maxchar;
7334
7335 assert (writer.kind == PyUnicode_1BYTE_KIND);
7336 while (s < e) {
7337 ch = *s;
7338 x = mapdata_ucs1[ch];
7339 if (x > maxchar) {
7340 if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
7341 goto onError;
7342 maxchar = writer.maxchar;
7343 outdata = (Py_UCS1 *)writer.data;
7344 }
7345 outdata[writer.pos] = x;
7346 writer.pos++;
7347 ++s;
7348 }
7349 }
7350
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007352 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007353 enum PyUnicode_Kind outkind = writer.kind;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007354 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007355 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007356 Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007357 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007358 while (s < e) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007359 ch = *s;
7360 x = mapdata_ucs2[ch];
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007361 if (x > maxchar)
7362 goto Error;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007363 outdata[writer.pos] = x;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007364 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007365 ++s;
7366 }
7367 break;
7368 }
7369 else if (outkind == PyUnicode_2BYTE_KIND) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007370 Py_UCS2 *outdata = (Py_UCS2 *)writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007371 while (s < e) {
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007372 ch = *s;
7373 x = mapdata_ucs2[ch];
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007374 if (x == 0xFFFE)
7375 goto Error;
Victor Stinner63d5c1a2013-04-09 22:13:33 +02007376 outdata[writer.pos] = x;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007377 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007378 ++s;
7379 }
7380 break;
7381 }
7382 }
7383 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007386 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007387 else
7388 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007389Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007390 if (x == 0xfffe)
7391 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 startinpos = s-starts;
7394 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007395 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 errors, &errorHandler,
7397 "charmap", "character maps to <undefined>",
7398 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007399 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 goto onError;
7401 }
7402 continue;
7403 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007404
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007405 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007406 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007407 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
7408 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007410 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007411 }
7412 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 while (s < e) {
7414 unsigned char ch = *s;
7415 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007416
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7418 w = PyLong_FromLong((long)ch);
7419 if (w == NULL)
7420 goto onError;
7421 x = PyObject_GetItem(mapping, w);
7422 Py_DECREF(w);
7423 if (x == NULL) {
7424 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7425 /* No mapping found means: mapping is undefined. */
7426 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007427 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 } else
7429 goto onError;
7430 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007431
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007433 if (x == Py_None)
7434 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 if (PyLong_Check(x)) {
7436 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007437 if (value == 0xFFFE)
7438 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007439 if (value < 0 || value > MAX_UNICODE) {
7440 PyErr_Format(PyExc_TypeError,
7441 "character mapping must be in range(0x%lx)",
7442 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 Py_DECREF(x);
7444 goto onError;
7445 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007446
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007447 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007448 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007449 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007450 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007451 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7452 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 else if (PyUnicode_Check(x)) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007455 if (PyUnicode_READY(x) == -1) {
7456 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007457 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007458 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007459 if (PyUnicode_GET_LENGTH(x) == 1) {
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007460 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007461 if (value == 0xFFFE)
7462 goto Undefined;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007463 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1) {
7464 Py_DECREF(x);
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007465 goto onError;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007466 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007467 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7468 writer.pos++;
7469 }
7470 else {
7471 writer.overallocate = 1;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007472 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007473 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007474 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007475 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007476 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 }
7478 else {
7479 /* wrong return value */
7480 PyErr_SetString(PyExc_TypeError,
7481 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007482 Py_DECREF(x);
7483 goto onError;
7484 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 Py_DECREF(x);
7486 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007487 continue;
7488Undefined:
7489 /* undefined mapping */
7490 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007491 startinpos = s-starts;
7492 endinpos = startinpos+1;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007493 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007494 errors, &errorHandler,
7495 "charmap", "character maps to <undefined>",
7496 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007497 &writer)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007498 goto onError;
7499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007502 Py_XDECREF(errorHandler);
7503 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007504 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007505
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007507 Py_XDECREF(errorHandler);
7508 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007509 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 return NULL;
7511}
7512
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007513/* Charmap encoding: the lookup table */
7514
Alexander Belopolsky40018472011-02-26 01:02:56 +00007515struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 PyObject_HEAD
7517 unsigned char level1[32];
7518 int count2, count3;
7519 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007520};
7521
7522static PyObject*
7523encoding_map_size(PyObject *obj, PyObject* args)
7524{
7525 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007526 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007528}
7529
7530static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007531 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 PyDoc_STR("Return the size (in bytes) of this object") },
7533 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007534};
7535
7536static void
7537encoding_map_dealloc(PyObject* o)
7538{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007539 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007540}
7541
7542static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007543 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 "EncodingMap", /*tp_name*/
7545 sizeof(struct encoding_map), /*tp_basicsize*/
7546 0, /*tp_itemsize*/
7547 /* methods */
7548 encoding_map_dealloc, /*tp_dealloc*/
7549 0, /*tp_print*/
7550 0, /*tp_getattr*/
7551 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007552 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 0, /*tp_repr*/
7554 0, /*tp_as_number*/
7555 0, /*tp_as_sequence*/
7556 0, /*tp_as_mapping*/
7557 0, /*tp_hash*/
7558 0, /*tp_call*/
7559 0, /*tp_str*/
7560 0, /*tp_getattro*/
7561 0, /*tp_setattro*/
7562 0, /*tp_as_buffer*/
7563 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7564 0, /*tp_doc*/
7565 0, /*tp_traverse*/
7566 0, /*tp_clear*/
7567 0, /*tp_richcompare*/
7568 0, /*tp_weaklistoffset*/
7569 0, /*tp_iter*/
7570 0, /*tp_iternext*/
7571 encoding_map_methods, /*tp_methods*/
7572 0, /*tp_members*/
7573 0, /*tp_getset*/
7574 0, /*tp_base*/
7575 0, /*tp_dict*/
7576 0, /*tp_descr_get*/
7577 0, /*tp_descr_set*/
7578 0, /*tp_dictoffset*/
7579 0, /*tp_init*/
7580 0, /*tp_alloc*/
7581 0, /*tp_new*/
7582 0, /*tp_free*/
7583 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007584};
7585
7586PyObject*
7587PyUnicode_BuildEncodingMap(PyObject* string)
7588{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007589 PyObject *result;
7590 struct encoding_map *mresult;
7591 int i;
7592 int need_dict = 0;
7593 unsigned char level1[32];
7594 unsigned char level2[512];
7595 unsigned char *mlevel1, *mlevel2, *mlevel3;
7596 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007597 int kind;
7598 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007599 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007600 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007601
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007602 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007603 PyErr_BadArgument();
7604 return NULL;
7605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007606 kind = PyUnicode_KIND(string);
7607 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007608 length = PyUnicode_GET_LENGTH(string);
7609 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007610 memset(level1, 0xFF, sizeof level1);
7611 memset(level2, 0xFF, sizeof level2);
7612
7613 /* If there isn't a one-to-one mapping of NULL to \0,
7614 or if there are non-BMP characters, we need to use
7615 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007616 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007617 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007618 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007620 ch = PyUnicode_READ(kind, data, i);
7621 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007622 need_dict = 1;
7623 break;
7624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007625 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007626 /* unmapped character */
7627 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007628 l1 = ch >> 11;
7629 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007630 if (level1[l1] == 0xFF)
7631 level1[l1] = count2++;
7632 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007633 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007634 }
7635
7636 if (count2 >= 0xFF || count3 >= 0xFF)
7637 need_dict = 1;
7638
7639 if (need_dict) {
7640 PyObject *result = PyDict_New();
7641 PyObject *key, *value;
7642 if (!result)
7643 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007644 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007645 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007646 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007647 if (!key || !value)
7648 goto failed1;
7649 if (PyDict_SetItem(result, key, value) == -1)
7650 goto failed1;
7651 Py_DECREF(key);
7652 Py_DECREF(value);
7653 }
7654 return result;
7655 failed1:
7656 Py_XDECREF(key);
7657 Py_XDECREF(value);
7658 Py_DECREF(result);
7659 return NULL;
7660 }
7661
7662 /* Create a three-level trie */
7663 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7664 16*count2 + 128*count3 - 1);
7665 if (!result)
7666 return PyErr_NoMemory();
7667 PyObject_Init(result, &EncodingMapType);
7668 mresult = (struct encoding_map*)result;
7669 mresult->count2 = count2;
7670 mresult->count3 = count3;
7671 mlevel1 = mresult->level1;
7672 mlevel2 = mresult->level23;
7673 mlevel3 = mresult->level23 + 16*count2;
7674 memcpy(mlevel1, level1, 32);
7675 memset(mlevel2, 0xFF, 16*count2);
7676 memset(mlevel3, 0, 128*count3);
7677 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007678 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007679 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007680 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7681 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007682 /* unmapped character */
7683 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007684 o1 = ch>>11;
7685 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007686 i2 = 16*mlevel1[o1] + o2;
7687 if (mlevel2[i2] == 0xFF)
7688 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007689 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007690 i3 = 128*mlevel2[i2] + o3;
7691 mlevel3[i3] = i;
7692 }
7693 return result;
7694}
7695
7696static int
Victor Stinner22168992011-11-20 17:09:18 +01007697encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007698{
7699 struct encoding_map *map = (struct encoding_map*)mapping;
7700 int l1 = c>>11;
7701 int l2 = (c>>7) & 0xF;
7702 int l3 = c & 0x7F;
7703 int i;
7704
Victor Stinner22168992011-11-20 17:09:18 +01007705 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007707 if (c == 0)
7708 return 0;
7709 /* level 1*/
7710 i = map->level1[l1];
7711 if (i == 0xFF) {
7712 return -1;
7713 }
7714 /* level 2*/
7715 i = map->level23[16*i+l2];
7716 if (i == 0xFF) {
7717 return -1;
7718 }
7719 /* level 3 */
7720 i = map->level23[16*map->count2 + 128*i + l3];
7721 if (i == 0) {
7722 return -1;
7723 }
7724 return i;
7725}
7726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007727/* Lookup the character ch in the mapping. If the character
7728 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007729 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007730static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007731charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732{
Christian Heimes217cfd12007-12-02 14:31:20 +00007733 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007734 PyObject *x;
7735
7736 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007738 x = PyObject_GetItem(mapping, w);
7739 Py_DECREF(w);
7740 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7742 /* No mapping found means: mapping is undefined. */
7743 PyErr_Clear();
7744 x = Py_None;
7745 Py_INCREF(x);
7746 return x;
7747 } else
7748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007750 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007752 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 long value = PyLong_AS_LONG(x);
7754 if (value < 0 || value > 255) {
7755 PyErr_SetString(PyExc_TypeError,
7756 "character mapping must be in range(256)");
7757 Py_DECREF(x);
7758 return NULL;
7759 }
7760 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007762 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 /* wrong return value */
7766 PyErr_Format(PyExc_TypeError,
7767 "character mapping must return integer, bytes or None, not %.400s",
7768 x->ob_type->tp_name);
7769 Py_DECREF(x);
7770 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 }
7772}
7773
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007775charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007776{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7778 /* exponentially overallocate to minimize reallocations */
7779 if (requiredsize < 2*outsize)
7780 requiredsize = 2*outsize;
7781 if (_PyBytes_Resize(outobj, requiredsize))
7782 return -1;
7783 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007784}
7785
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007790 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007791 space is available. Return a new reference to the object that
7792 was put in the output buffer, or Py_None, if the mapping was undefined
7793 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007794 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007795static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007796charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007797 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007798{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007799 PyObject *rep;
7800 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007801 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007802
Christian Heimes90aa7642007-12-19 02:45:37 +00007803 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007804 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007806 if (res == -1)
7807 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 if (outsize<requiredsize)
7809 if (charmapencode_resize(outobj, outpos, requiredsize))
7810 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007811 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 outstart[(*outpos)++] = (char)res;
7813 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007814 }
7815
7816 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007817 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007819 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 Py_DECREF(rep);
7821 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007822 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 if (PyLong_Check(rep)) {
7824 Py_ssize_t requiredsize = *outpos+1;
7825 if (outsize<requiredsize)
7826 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7827 Py_DECREF(rep);
7828 return enc_EXCEPTION;
7829 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007830 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007832 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 else {
7834 const char *repchars = PyBytes_AS_STRING(rep);
7835 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7836 Py_ssize_t requiredsize = *outpos+repsize;
7837 if (outsize<requiredsize)
7838 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7839 Py_DECREF(rep);
7840 return enc_EXCEPTION;
7841 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007842 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 memcpy(outstart + *outpos, repchars, repsize);
7844 *outpos += repsize;
7845 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007846 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007847 Py_DECREF(rep);
7848 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007849}
7850
7851/* handle an error in PyUnicode_EncodeCharmap
7852 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007853static int
7854charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007855 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007856 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007857 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007858 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007859{
7860 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007861 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007862 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007863 enum PyUnicode_Kind kind;
7864 void *data;
7865 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007867 Py_ssize_t collstartpos = *inpos;
7868 Py_ssize_t collendpos = *inpos+1;
7869 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870 char *encoding = "charmap";
7871 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007872 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007873 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007874 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875
Benjamin Petersonbac79492012-01-14 13:34:47 -05007876 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007877 return -1;
7878 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007879 /* find all unencodable characters */
7880 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007882 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007883 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007884 val = encoding_map_lookup(ch, mapping);
7885 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 break;
7887 ++collendpos;
7888 continue;
7889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007890
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007891 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7892 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 if (rep==NULL)
7894 return -1;
7895 else if (rep!=Py_None) {
7896 Py_DECREF(rep);
7897 break;
7898 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007899 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007901 }
7902 /* cache callback name lookup
7903 * (if not done yet, i.e. it's the first error) */
7904 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 if ((errors==NULL) || (!strcmp(errors, "strict")))
7906 *known_errorHandler = 1;
7907 else if (!strcmp(errors, "replace"))
7908 *known_errorHandler = 2;
7909 else if (!strcmp(errors, "ignore"))
7910 *known_errorHandler = 3;
7911 else if (!strcmp(errors, "xmlcharrefreplace"))
7912 *known_errorHandler = 4;
7913 else
7914 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007915 }
7916 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007917 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007918 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007919 return -1;
7920 case 2: /* replace */
7921 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 x = charmapencode_output('?', mapping, res, respos);
7923 if (x==enc_EXCEPTION) {
7924 return -1;
7925 }
7926 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007927 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 return -1;
7929 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007930 }
7931 /* fall through */
7932 case 3: /* ignore */
7933 *inpos = collendpos;
7934 break;
7935 case 4: /* xmlcharrefreplace */
7936 /* generate replacement (temporarily (mis)uses p) */
7937 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 char buffer[2+29+1+1];
7939 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007940 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 for (cp = buffer; *cp; ++cp) {
7942 x = charmapencode_output(*cp, mapping, res, respos);
7943 if (x==enc_EXCEPTION)
7944 return -1;
7945 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007946 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 return -1;
7948 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007949 }
7950 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007951 *inpos = collendpos;
7952 break;
7953 default:
7954 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007955 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007957 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007959 if (PyBytes_Check(repunicode)) {
7960 /* Directly copy bytes result to output. */
7961 Py_ssize_t outsize = PyBytes_Size(*res);
7962 Py_ssize_t requiredsize;
7963 repsize = PyBytes_Size(repunicode);
7964 requiredsize = *respos + repsize;
7965 if (requiredsize > outsize)
7966 /* Make room for all additional bytes. */
7967 if (charmapencode_resize(res, respos, requiredsize)) {
7968 Py_DECREF(repunicode);
7969 return -1;
7970 }
7971 memcpy(PyBytes_AsString(*res) + *respos,
7972 PyBytes_AsString(repunicode), repsize);
7973 *respos += repsize;
7974 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007975 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007976 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007978 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007979 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007980 Py_DECREF(repunicode);
7981 return -1;
7982 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007983 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007984 data = PyUnicode_DATA(repunicode);
7985 kind = PyUnicode_KIND(repunicode);
7986 for (index = 0; index < repsize; index++) {
7987 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7988 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007990 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 return -1;
7992 }
7993 else if (x==enc_FAILED) {
7994 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007995 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 return -1;
7997 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007998 }
7999 *inpos = newpos;
8000 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001 }
8002 return 0;
8003}
8004
Alexander Belopolsky40018472011-02-26 01:02:56 +00008005PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008006_PyUnicode_EncodeCharmap(PyObject *unicode,
8007 PyObject *mapping,
8008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008010 /* output object */
8011 PyObject *res = NULL;
8012 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008014 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008015 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008016 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017 PyObject *errorHandler = NULL;
8018 PyObject *exc = NULL;
8019 /* the following variable is used for caching string comparisons
8020 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8021 * 3=ignore, 4=xmlcharrefreplace */
8022 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008023 void *data;
8024 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
Benjamin Petersonbac79492012-01-14 13:34:47 -05008026 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008027 return NULL;
8028 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008029 data = PyUnicode_DATA(unicode);
8030 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008031
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 /* Default to Latin-1 */
8033 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008034 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008036 /* allocate enough for a simple encoding without
8037 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008038 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008039 if (res == NULL)
8040 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008041 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008045 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008047 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 if (x==enc_EXCEPTION) /* error */
8049 goto onError;
8050 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008051 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 &exc,
8053 &known_errorHandler, &errorHandler, errors,
8054 &res, &respos)) {
8055 goto onError;
8056 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 else
8059 /* done with this character => adjust input position */
8060 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008064 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008065 if (_PyBytes_Resize(&res, respos) < 0)
8066 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008068 Py_XDECREF(exc);
8069 Py_XDECREF(errorHandler);
8070 return res;
8071
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008073 Py_XDECREF(res);
8074 Py_XDECREF(exc);
8075 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 return NULL;
8077}
8078
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008079/* Deprecated */
8080PyObject *
8081PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8082 Py_ssize_t size,
8083 PyObject *mapping,
8084 const char *errors)
8085{
8086 PyObject *result;
8087 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8088 if (unicode == NULL)
8089 return NULL;
8090 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8091 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008092 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093}
8094
Alexander Belopolsky40018472011-02-26 01:02:56 +00008095PyObject *
8096PyUnicode_AsCharmapString(PyObject *unicode,
8097 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098{
8099 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 PyErr_BadArgument();
8101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104}
8105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008106/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008107static void
8108make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008110 Py_ssize_t startpos, Py_ssize_t endpos,
8111 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 *exceptionObject = _PyUnicodeTranslateError_Create(
8115 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 }
8117 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8119 goto onError;
8120 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8121 goto onError;
8122 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8123 goto onError;
8124 return;
8125 onError:
8126 Py_DECREF(*exceptionObject);
8127 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 }
8129}
8130
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131/* error handling callback helper:
8132 build arguments, call the callback and check the arguments,
8133 put the result into newpos and return the replacement string, which
8134 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008135static PyObject *
8136unicode_translate_call_errorhandler(const char *errors,
8137 PyObject **errorHandler,
8138 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008139 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008140 Py_ssize_t startpos, Py_ssize_t endpos,
8141 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008143 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008145 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 PyObject *restuple;
8147 PyObject *resunicode;
8148
8149 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153 }
8154
8155 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008157 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159
8160 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008164 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008165 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 Py_DECREF(restuple);
8167 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008168 }
8169 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 &resunicode, &i_newpos)) {
8171 Py_DECREF(restuple);
8172 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008173 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008174 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008176 else
8177 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008178 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8180 Py_DECREF(restuple);
8181 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008182 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008183 Py_INCREF(resunicode);
8184 Py_DECREF(restuple);
8185 return resunicode;
8186}
8187
8188/* Lookup the character ch in the mapping and put the result in result,
8189 which must be decrefed by the caller.
8190 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008191static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193{
Christian Heimes217cfd12007-12-02 14:31:20 +00008194 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008195 PyObject *x;
8196
8197 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 x = PyObject_GetItem(mapping, w);
8200 Py_DECREF(w);
8201 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8203 /* No mapping found means: use 1:1 mapping. */
8204 PyErr_Clear();
8205 *result = NULL;
8206 return 0;
8207 } else
8208 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 }
8210 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 *result = x;
8212 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008214 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 long value = PyLong_AS_LONG(x);
8216 long max = PyUnicode_GetMax();
8217 if (value < 0 || value > max) {
8218 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008219 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 Py_DECREF(x);
8221 return -1;
8222 }
8223 *result = x;
8224 return 0;
8225 }
8226 else if (PyUnicode_Check(x)) {
8227 *result = x;
8228 return 0;
8229 }
8230 else {
8231 /* wrong return value */
8232 PyErr_SetString(PyExc_TypeError,
8233 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008234 Py_DECREF(x);
8235 return -1;
8236 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008237}
8238/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 if not reallocate and adjust various state variables.
8240 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008241static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008245 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008246 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008247 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 /* exponentially overallocate to minimize reallocations */
8249 if (requiredsize < 2 * oldsize)
8250 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008251 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8252 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008254 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008255 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256 }
8257 return 0;
8258}
8259/* lookup the character, put the result in the output string and adjust
8260 various state variables. Return a new reference to the object that
8261 was put in the output buffer in *result, or Py_None, if the mapping was
8262 undefined (in which case no character was written).
8263 The called must decref result.
8264 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008265static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8267 PyObject *mapping, Py_UCS4 **output,
8268 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008269 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8272 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008276 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 }
8278 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008280 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008282 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283 }
8284 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008285 Py_ssize_t repsize;
8286 if (PyUnicode_READY(*res) == -1)
8287 return -1;
8288 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 if (repsize==1) {
8290 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008291 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 }
8293 else if (repsize!=0) {
8294 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 Py_ssize_t requiredsize = *opos +
8296 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298 Py_ssize_t i;
8299 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008301 for(i = 0; i < repsize; i++)
8302 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 }
8305 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 return 0;
8308}
8309
Alexander Belopolsky40018472011-02-26 01:02:56 +00008310PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008311_PyUnicode_TranslateCharmap(PyObject *input,
8312 PyObject *mapping,
8313 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 /* input object */
8316 char *idata;
8317 Py_ssize_t size, i;
8318 int kind;
8319 /* output buffer */
8320 Py_UCS4 *output = NULL;
8321 Py_ssize_t osize;
8322 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008325 char *reason = "character maps to <undefined>";
8326 PyObject *errorHandler = NULL;
8327 PyObject *exc = NULL;
8328 /* the following variable is used for caching string comparisons
8329 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8330 * 3=ignore, 4=xmlcharrefreplace */
8331 int known_errorHandler = -1;
8332
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 PyErr_BadArgument();
8335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338 if (PyUnicode_READY(input) == -1)
8339 return NULL;
8340 idata = (char*)PyUnicode_DATA(input);
8341 kind = PyUnicode_KIND(input);
8342 size = PyUnicode_GET_LENGTH(input);
8343 i = 0;
8344
8345 if (size == 0) {
8346 Py_INCREF(input);
8347 return input;
8348 }
8349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 /* allocate enough for a simple 1:1 translation without
8351 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 osize = size;
8353 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8354 opos = 0;
8355 if (output == NULL) {
8356 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 /* try to encode it */
8362 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008363 if (charmaptranslate_output(input, i, mapping,
8364 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 Py_XDECREF(x);
8366 goto onError;
8367 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008368 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 else { /* untranslatable character */
8372 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8373 Py_ssize_t repsize;
8374 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 Py_ssize_t collstart = i;
8378 Py_ssize_t collend = i+1;
8379 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 while (collend < size) {
8383 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 goto onError;
8385 Py_XDECREF(x);
8386 if (x!=Py_None)
8387 break;
8388 ++collend;
8389 }
8390 /* cache callback name lookup
8391 * (if not done yet, i.e. it's the first error) */
8392 if (known_errorHandler==-1) {
8393 if ((errors==NULL) || (!strcmp(errors, "strict")))
8394 known_errorHandler = 1;
8395 else if (!strcmp(errors, "replace"))
8396 known_errorHandler = 2;
8397 else if (!strcmp(errors, "ignore"))
8398 known_errorHandler = 3;
8399 else if (!strcmp(errors, "xmlcharrefreplace"))
8400 known_errorHandler = 4;
8401 else
8402 known_errorHandler = 0;
8403 }
8404 switch (known_errorHandler) {
8405 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008406 make_translate_exception(&exc,
8407 input, collstart, collend, reason);
8408 if (exc != NULL)
8409 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 case 2: /* replace */
8412 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 for (coll = collstart; coll<collend; coll++)
8414 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 /* fall through */
8416 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 break;
8419 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420 /* generate replacement (temporarily (mis)uses i) */
8421 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 char buffer[2+29+1+1];
8423 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8425 if (charmaptranslate_makespace(&output, &osize,
8426 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 goto onError;
8428 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 break;
8433 default:
8434 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 reason, input, &exc,
8436 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008437 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008439 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008440 Py_DECREF(repunicode);
8441 goto onError;
8442 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 repsize = PyUnicode_GET_LENGTH(repunicode);
8445 if (charmaptranslate_makespace(&output, &osize,
8446 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 Py_DECREF(repunicode);
8448 goto onError;
8449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 for (uni2 = 0; repsize-->0; ++uni2)
8451 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8452 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 }
8456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8458 if (!res)
8459 goto onError;
8460 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 Py_XDECREF(exc);
8462 Py_XDECREF(errorHandler);
8463 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008467 Py_XDECREF(exc);
8468 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469 return NULL;
8470}
8471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472/* Deprecated. Use PyUnicode_Translate instead. */
8473PyObject *
8474PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8475 Py_ssize_t size,
8476 PyObject *mapping,
8477 const char *errors)
8478{
Christian Heimes5f520f42012-09-11 14:03:25 +02008479 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8481 if (!unicode)
8482 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008483 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8484 Py_DECREF(unicode);
8485 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486}
8487
Alexander Belopolsky40018472011-02-26 01:02:56 +00008488PyObject *
8489PyUnicode_Translate(PyObject *str,
8490 PyObject *mapping,
8491 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492{
8493 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008494
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 str = PyUnicode_FromObject(str);
8496 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008497 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 Py_DECREF(str);
8500 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501}
Tim Petersced69f82003-09-16 20:30:58 +00008502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008504fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505{
8506 /* No need to call PyUnicode_READY(self) because this function is only
8507 called as a callback from fixup() which does it already. */
8508 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8509 const int kind = PyUnicode_KIND(self);
8510 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008511 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008512 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 Py_ssize_t i;
8514
8515 for (i = 0; i < len; ++i) {
8516 ch = PyUnicode_READ(kind, data, i);
8517 fixed = 0;
8518 if (ch > 127) {
8519 if (Py_UNICODE_ISSPACE(ch))
8520 fixed = ' ';
8521 else {
8522 const int decimal = Py_UNICODE_TODECIMAL(ch);
8523 if (decimal >= 0)
8524 fixed = '0' + decimal;
8525 }
8526 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008527 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008528 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 PyUnicode_WRITE(kind, data, i, fixed);
8530 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008531 else
8532 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 }
8535
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008536 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537}
8538
8539PyObject *
8540_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8541{
8542 if (!PyUnicode_Check(unicode)) {
8543 PyErr_BadInternalCall();
8544 return NULL;
8545 }
8546 if (PyUnicode_READY(unicode) == -1)
8547 return NULL;
8548 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8549 /* If the string is already ASCII, just return the same string */
8550 Py_INCREF(unicode);
8551 return unicode;
8552 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008553 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554}
8555
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008556PyObject *
8557PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8558 Py_ssize_t length)
8559{
Victor Stinnerf0124502011-11-21 23:12:56 +01008560 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008561 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008562 Py_UCS4 maxchar;
8563 enum PyUnicode_Kind kind;
8564 void *data;
8565
Victor Stinner99d7ad02012-02-22 13:37:39 +01008566 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008567 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008568 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008569 if (ch > 127) {
8570 int decimal = Py_UNICODE_TODECIMAL(ch);
8571 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008572 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008573 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008574 }
8575 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008576
8577 /* Copy to a new string */
8578 decimal = PyUnicode_New(length, maxchar);
8579 if (decimal == NULL)
8580 return decimal;
8581 kind = PyUnicode_KIND(decimal);
8582 data = PyUnicode_DATA(decimal);
8583 /* Iterate over code points */
8584 for (i = 0; i < length; i++) {
8585 Py_UNICODE ch = s[i];
8586 if (ch > 127) {
8587 int decimal = Py_UNICODE_TODECIMAL(ch);
8588 if (decimal >= 0)
8589 ch = '0' + decimal;
8590 }
8591 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008593 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008594}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008595/* --- Decimal Encoder ---------------------------------------------------- */
8596
Alexander Belopolsky40018472011-02-26 01:02:56 +00008597int
8598PyUnicode_EncodeDecimal(Py_UNICODE *s,
8599 Py_ssize_t length,
8600 char *output,
8601 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008602{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008603 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008604 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008605 enum PyUnicode_Kind kind;
8606 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008607
8608 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 PyErr_BadArgument();
8610 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008611 }
8612
Victor Stinner42bf7752011-11-21 22:52:58 +01008613 unicode = PyUnicode_FromUnicode(s, length);
8614 if (unicode == NULL)
8615 return -1;
8616
Benjamin Petersonbac79492012-01-14 13:34:47 -05008617 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008618 Py_DECREF(unicode);
8619 return -1;
8620 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008621 kind = PyUnicode_KIND(unicode);
8622 data = PyUnicode_DATA(unicode);
8623
Victor Stinnerb84d7232011-11-22 01:50:07 +01008624 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008625 PyObject *exc;
8626 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008628 Py_ssize_t startpos;
8629
8630 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008631
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008633 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008634 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008636 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 decimal = Py_UNICODE_TODECIMAL(ch);
8638 if (decimal >= 0) {
8639 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008640 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 continue;
8642 }
8643 if (0 < ch && ch < 256) {
8644 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008645 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 continue;
8647 }
Victor Stinner6345be92011-11-25 20:09:01 +01008648
Victor Stinner42bf7752011-11-21 22:52:58 +01008649 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008650 exc = NULL;
8651 raise_encode_exception(&exc, "decimal", unicode,
8652 startpos, startpos+1,
8653 "invalid decimal Unicode string");
8654 Py_XDECREF(exc);
8655 Py_DECREF(unicode);
8656 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008657 }
8658 /* 0-terminate the output string */
8659 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008660 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008661 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008662}
8663
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664/* --- Helpers ------------------------------------------------------------ */
8665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008667any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 Py_ssize_t start,
8669 Py_ssize_t end)
8670{
8671 int kind1, kind2, kind;
8672 void *buf1, *buf2;
8673 Py_ssize_t len1, len2, result;
8674
8675 kind1 = PyUnicode_KIND(s1);
8676 kind2 = PyUnicode_KIND(s2);
8677 kind = kind1 > kind2 ? kind1 : kind2;
8678 buf1 = PyUnicode_DATA(s1);
8679 buf2 = PyUnicode_DATA(s2);
8680 if (kind1 != kind)
8681 buf1 = _PyUnicode_AsKind(s1, kind);
8682 if (!buf1)
8683 return -2;
8684 if (kind2 != kind)
8685 buf2 = _PyUnicode_AsKind(s2, kind);
8686 if (!buf2) {
8687 if (kind1 != kind) PyMem_Free(buf1);
8688 return -2;
8689 }
8690 len1 = PyUnicode_GET_LENGTH(s1);
8691 len2 = PyUnicode_GET_LENGTH(s2);
8692
Victor Stinner794d5672011-10-10 03:21:36 +02008693 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008694 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008695 case PyUnicode_1BYTE_KIND:
8696 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8697 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8698 else
8699 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8700 break;
8701 case PyUnicode_2BYTE_KIND:
8702 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8703 break;
8704 case PyUnicode_4BYTE_KIND:
8705 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8706 break;
8707 default:
8708 assert(0); result = -2;
8709 }
8710 }
8711 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008712 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008713 case PyUnicode_1BYTE_KIND:
8714 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8715 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8716 else
8717 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8718 break;
8719 case PyUnicode_2BYTE_KIND:
8720 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8721 break;
8722 case PyUnicode_4BYTE_KIND:
8723 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8724 break;
8725 default:
8726 assert(0); result = -2;
8727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 }
8729
8730 if (kind1 != kind)
8731 PyMem_Free(buf1);
8732 if (kind2 != kind)
8733 PyMem_Free(buf2);
8734
8735 return result;
8736}
8737
8738Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008739_PyUnicode_InsertThousandsGrouping(
8740 PyObject *unicode, Py_ssize_t index,
8741 Py_ssize_t n_buffer,
8742 void *digits, Py_ssize_t n_digits,
8743 Py_ssize_t min_width,
8744 const char *grouping, PyObject *thousands_sep,
8745 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746{
Victor Stinner41a863c2012-02-24 00:37:51 +01008747 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008748 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008749 Py_ssize_t thousands_sep_len;
8750 Py_ssize_t len;
8751
8752 if (unicode != NULL) {
8753 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008754 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008755 }
8756 else {
8757 kind = PyUnicode_1BYTE_KIND;
8758 data = NULL;
8759 }
8760 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8761 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8762 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8763 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008764 if (thousands_sep_kind < kind) {
8765 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8766 if (!thousands_sep_data)
8767 return -1;
8768 }
8769 else {
8770 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8771 if (!data)
8772 return -1;
8773 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008774 }
8775
Benjamin Petersonead6b532011-12-20 17:23:42 -06008776 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008778 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008779 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008780 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008781 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008782 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008783 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008784 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008785 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008786 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008787 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008788 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008790 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008791 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008792 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008793 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008794 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008796 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008797 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008798 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008799 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008800 break;
8801 default:
8802 assert(0);
8803 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008805 if (unicode != NULL && thousands_sep_kind != kind) {
8806 if (thousands_sep_kind < kind)
8807 PyMem_Free(thousands_sep_data);
8808 else
8809 PyMem_Free(data);
8810 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008811 if (unicode == NULL) {
8812 *maxchar = 127;
8813 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008814 *maxchar = MAX_MAXCHAR(*maxchar,
8815 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008816 }
8817 }
8818 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819}
8820
8821
/* helper macro to fixup start/end slice values

   `end` is clamped to at most `len`; a negative `start` or `end` is taken
   relative to `len` and then clamped at 0.  `start` and `end` must be
   modifiable lvalues and every argument may be evaluated more than once.

   The expansion is a single statement (do { } while (0)), so it must be
   followed by a semicolon and is safe as the body of an unbraced
   if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008836
Alexander Belopolsky40018472011-02-26 01:02:56 +00008837Py_ssize_t
8838PyUnicode_Count(PyObject *str,
8839 PyObject *substr,
8840 Py_ssize_t start,
8841 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008843 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008844 PyObject* str_obj;
8845 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 int kind1, kind2, kind;
8847 void *buf1 = NULL, *buf2 = NULL;
8848 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008849
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008850 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008851 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008853 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008854 if (!sub_obj) {
8855 Py_DECREF(str_obj);
8856 return -1;
8857 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008858 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008859 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 Py_DECREF(str_obj);
8861 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 }
Tim Petersced69f82003-09-16 20:30:58 +00008863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 kind1 = PyUnicode_KIND(str_obj);
8865 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008866 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008869 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008870 if (kind2 > kind) {
8871 Py_DECREF(sub_obj);
8872 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008873 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008874 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008875 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 if (!buf2)
8878 goto onError;
8879 len1 = PyUnicode_GET_LENGTH(str_obj);
8880 len2 = PyUnicode_GET_LENGTH(sub_obj);
8881
8882 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008883 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008885 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8886 result = asciilib_count(
8887 ((Py_UCS1*)buf1) + start, end - start,
8888 buf2, len2, PY_SSIZE_T_MAX
8889 );
8890 else
8891 result = ucs1lib_count(
8892 ((Py_UCS1*)buf1) + start, end - start,
8893 buf2, len2, PY_SSIZE_T_MAX
8894 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 break;
8896 case PyUnicode_2BYTE_KIND:
8897 result = ucs2lib_count(
8898 ((Py_UCS2*)buf1) + start, end - start,
8899 buf2, len2, PY_SSIZE_T_MAX
8900 );
8901 break;
8902 case PyUnicode_4BYTE_KIND:
8903 result = ucs4lib_count(
8904 ((Py_UCS4*)buf1) + start, end - start,
8905 buf2, len2, PY_SSIZE_T_MAX
8906 );
8907 break;
8908 default:
8909 assert(0); result = 0;
8910 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008911
8912 Py_DECREF(sub_obj);
8913 Py_DECREF(str_obj);
8914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 if (kind2 != kind)
8916 PyMem_Free(buf2);
8917
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 onError:
8920 Py_DECREF(sub_obj);
8921 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 if (kind2 != kind && buf2)
8923 PyMem_Free(buf2);
8924 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925}
8926
Alexander Belopolsky40018472011-02-26 01:02:56 +00008927Py_ssize_t
8928PyUnicode_Find(PyObject *str,
8929 PyObject *sub,
8930 Py_ssize_t start,
8931 Py_ssize_t end,
8932 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008934 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008935
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008937 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008939 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008940 if (!sub) {
8941 Py_DECREF(str);
8942 return -2;
8943 }
8944 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8945 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 Py_DECREF(str);
8947 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948 }
Tim Petersced69f82003-09-16 20:30:58 +00008949
Victor Stinner794d5672011-10-10 03:21:36 +02008950 result = any_find_slice(direction,
8951 str, sub, start, end
8952 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008953
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008955 Py_DECREF(sub);
8956
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 return result;
8958}
8959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960Py_ssize_t
8961PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8962 Py_ssize_t start, Py_ssize_t end,
8963 int direction)
8964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008966 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 if (PyUnicode_READY(str) == -1)
8968 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008969 if (start < 0 || end < 0) {
8970 PyErr_SetString(PyExc_IndexError, "string index out of range");
8971 return -2;
8972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 if (end > PyUnicode_GET_LENGTH(str))
8974 end = PyUnicode_GET_LENGTH(str);
8975 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008976 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8977 kind, end-start, ch, direction);
8978 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008980 else
8981 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982}
8983
Alexander Belopolsky40018472011-02-26 01:02:56 +00008984static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008985tailmatch(PyObject *self,
8986 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008987 Py_ssize_t start,
8988 Py_ssize_t end,
8989 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 int kind_self;
8992 int kind_sub;
8993 void *data_self;
8994 void *data_sub;
8995 Py_ssize_t offset;
8996 Py_ssize_t i;
8997 Py_ssize_t end_sub;
8998
8999 if (PyUnicode_READY(self) == -1 ||
9000 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009001 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002
9003 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 return 1;
9005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9007 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 kind_self = PyUnicode_KIND(self);
9012 data_self = PyUnicode_DATA(self);
9013 kind_sub = PyUnicode_KIND(substring);
9014 data_sub = PyUnicode_DATA(substring);
9015 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9016
9017 if (direction > 0)
9018 offset = end;
9019 else
9020 offset = start;
9021
9022 if (PyUnicode_READ(kind_self, data_self, offset) ==
9023 PyUnicode_READ(kind_sub, data_sub, 0) &&
9024 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9025 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9026 /* If both are of the same kind, memcmp is sufficient */
9027 if (kind_self == kind_sub) {
9028 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009029 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 data_sub,
9031 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009032 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033 }
9034 /* otherwise we have to compare each character by first accesing it */
9035 else {
9036 /* We do not need to compare 0 and len(substring)-1 because
9037 the if statement above ensured already that they are equal
9038 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 for (i = 1; i < end_sub; ++i) {
9040 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9041 PyUnicode_READ(kind_sub, data_sub, i))
9042 return 0;
9043 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 }
9047
9048 return 0;
9049}
9050
Alexander Belopolsky40018472011-02-26 01:02:56 +00009051Py_ssize_t
9052PyUnicode_Tailmatch(PyObject *str,
9053 PyObject *substr,
9054 Py_ssize_t start,
9055 Py_ssize_t end,
9056 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009058 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009059
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060 str = PyUnicode_FromObject(str);
9061 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009062 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063 substr = PyUnicode_FromObject(substr);
9064 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009065 Py_DECREF(str);
9066 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067 }
Tim Petersced69f82003-09-16 20:30:58 +00009068
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009069 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071 Py_DECREF(str);
9072 Py_DECREF(substr);
9073 return result;
9074}
9075
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076/* Apply fixfct filter to the Unicode object self and return a
9077 reference to the modified object */
9078
Alexander Belopolsky40018472011-02-26 01:02:56 +00009079static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009080fixup(PyObject *self,
9081 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 PyObject *u;
9084 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009085 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009087 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009090 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 /* fix functions return the new maximum character in a string,
9093 if the kind of the resulting unicode object does not change,
9094 everything is fine. Otherwise we need to change the string kind
9095 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009096 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009097
9098 if (maxchar_new == 0) {
9099 /* no changes */;
9100 if (PyUnicode_CheckExact(self)) {
9101 Py_DECREF(u);
9102 Py_INCREF(self);
9103 return self;
9104 }
9105 else
9106 return u;
9107 }
9108
Victor Stinnere6abb482012-05-02 01:15:40 +02009109 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110
Victor Stinnereaab6042011-12-11 22:22:39 +01009111 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009113
9114 /* In case the maximum character changed, we need to
9115 convert the string to the new category. */
9116 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9117 if (v == NULL) {
9118 Py_DECREF(u);
9119 return NULL;
9120 }
9121 if (maxchar_new > maxchar_old) {
9122 /* If the maxchar increased so that the kind changed, not all
9123 characters are representable anymore and we need to fix the
9124 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009125 _PyUnicode_FastCopyCharacters(v, 0,
9126 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009127 maxchar_old = fixfct(v);
9128 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129 }
9130 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009131 _PyUnicode_FastCopyCharacters(v, 0,
9132 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009134 Py_DECREF(u);
9135 assert(_PyUnicode_CheckConsistency(v, 1));
9136 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137}
9138
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009139static PyObject *
9140ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009142 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9143 char *resdata, *data = PyUnicode_DATA(self);
9144 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009145
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009146 res = PyUnicode_New(len, 127);
9147 if (res == NULL)
9148 return NULL;
9149 resdata = PyUnicode_DATA(res);
9150 if (lower)
9151 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009153 _Py_bytes_upper(resdata, data, len);
9154 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155}
9156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009158handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009160 Py_ssize_t j;
9161 int final_sigma;
9162 Py_UCS4 c;
9163 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009164
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009165 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9166
9167 where ! is a negation and \p{xxx} is a character with property xxx.
9168 */
9169 for (j = i - 1; j >= 0; j--) {
9170 c = PyUnicode_READ(kind, data, j);
9171 if (!_PyUnicode_IsCaseIgnorable(c))
9172 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009174 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9175 if (final_sigma) {
9176 for (j = i + 1; j < length; j++) {
9177 c = PyUnicode_READ(kind, data, j);
9178 if (!_PyUnicode_IsCaseIgnorable(c))
9179 break;
9180 }
9181 final_sigma = j == length || !_PyUnicode_IsCased(c);
9182 }
9183 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184}
9185
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009186static int
9187lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9188 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009190 /* Obscure special case. */
9191 if (c == 0x3A3) {
9192 mapped[0] = handle_capital_sigma(kind, data, length, i);
9193 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009195 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196}
9197
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009198static Py_ssize_t
9199do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009201 Py_ssize_t i, k = 0;
9202 int n_res, j;
9203 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009204
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009205 c = PyUnicode_READ(kind, data, 0);
9206 n_res = _PyUnicode_ToUpperFull(c, mapped);
9207 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009208 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009209 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009211 for (i = 1; i < length; i++) {
9212 c = PyUnicode_READ(kind, data, i);
9213 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9214 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009215 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009216 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009217 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009218 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009219 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220}
9221
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009222static Py_ssize_t
9223do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9224 Py_ssize_t i, k = 0;
9225
9226 for (i = 0; i < length; i++) {
9227 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9228 int n_res, j;
9229 if (Py_UNICODE_ISUPPER(c)) {
9230 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9231 }
9232 else if (Py_UNICODE_ISLOWER(c)) {
9233 n_res = _PyUnicode_ToUpperFull(c, mapped);
9234 }
9235 else {
9236 n_res = 1;
9237 mapped[0] = c;
9238 }
9239 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009240 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009241 res[k++] = mapped[j];
9242 }
9243 }
9244 return k;
9245}
9246
9247static Py_ssize_t
9248do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9249 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009251 Py_ssize_t i, k = 0;
9252
9253 for (i = 0; i < length; i++) {
9254 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9255 int n_res, j;
9256 if (lower)
9257 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9258 else
9259 n_res = _PyUnicode_ToUpperFull(c, mapped);
9260 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009261 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009262 res[k++] = mapped[j];
9263 }
9264 }
9265 return k;
9266}
9267
9268static Py_ssize_t
9269do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9270{
9271 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9272}
9273
9274static Py_ssize_t
9275do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9276{
9277 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9278}
9279
Benjamin Petersone51757f2012-01-12 21:10:29 -05009280static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009281do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9282{
9283 Py_ssize_t i, k = 0;
9284
9285 for (i = 0; i < length; i++) {
9286 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9287 Py_UCS4 mapped[3];
9288 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9289 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009290 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009291 res[k++] = mapped[j];
9292 }
9293 }
9294 return k;
9295}
9296
9297static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009298do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9299{
9300 Py_ssize_t i, k = 0;
9301 int previous_is_cased;
9302
9303 previous_is_cased = 0;
9304 for (i = 0; i < length; i++) {
9305 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9306 Py_UCS4 mapped[3];
9307 int n_res, j;
9308
9309 if (previous_is_cased)
9310 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9311 else
9312 n_res = _PyUnicode_ToTitleFull(c, mapped);
9313
9314 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009315 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009316 res[k++] = mapped[j];
9317 }
9318
9319 previous_is_cased = _PyUnicode_IsCased(c);
9320 }
9321 return k;
9322}
9323
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009324static PyObject *
9325case_operation(PyObject *self,
9326 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9327{
9328 PyObject *res = NULL;
9329 Py_ssize_t length, newlength = 0;
9330 int kind, outkind;
9331 void *data, *outdata;
9332 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9333
Benjamin Petersoneea48462012-01-16 14:28:50 -05009334 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009335
9336 kind = PyUnicode_KIND(self);
9337 data = PyUnicode_DATA(self);
9338 length = PyUnicode_GET_LENGTH(self);
9339 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9340 if (tmp == NULL)
9341 return PyErr_NoMemory();
9342 newlength = perform(kind, data, length, tmp, &maxchar);
9343 res = PyUnicode_New(newlength, maxchar);
9344 if (res == NULL)
9345 goto leave;
9346 tmpend = tmp + newlength;
9347 outdata = PyUnicode_DATA(res);
9348 outkind = PyUnicode_KIND(res);
9349 switch (outkind) {
9350 case PyUnicode_1BYTE_KIND:
9351 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9352 break;
9353 case PyUnicode_2BYTE_KIND:
9354 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9355 break;
9356 case PyUnicode_4BYTE_KIND:
9357 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9358 break;
9359 default:
9360 assert(0);
9361 break;
9362 }
9363 leave:
9364 PyMem_FREE(tmp);
9365 return res;
9366}
9367
Tim Peters8ce9f162004-08-27 01:49:32 +00009368PyObject *
9369PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009372 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009374 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009375 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9376 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009377 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009379 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009381 int use_memcpy;
9382 unsigned char *res_data = NULL, *sep_data = NULL;
9383 PyObject *last_obj;
9384 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385
Tim Peters05eba1f2004-08-27 21:32:02 +00009386 fseq = PySequence_Fast(seq, "");
9387 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009388 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009389 }
9390
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009391 /* NOTE: the following code can't call back into Python code,
9392 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009393 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009394
Tim Peters05eba1f2004-08-27 21:32:02 +00009395 seqlen = PySequence_Fast_GET_SIZE(fseq);
9396 /* If empty sequence, return u"". */
9397 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009398 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009399 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009400 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009401
Tim Peters05eba1f2004-08-27 21:32:02 +00009402 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009403 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009404 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009405 if (seqlen == 1) {
9406 if (PyUnicode_CheckExact(items[0])) {
9407 res = items[0];
9408 Py_INCREF(res);
9409 Py_DECREF(fseq);
9410 return res;
9411 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009412 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009413 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009414 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009415 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009416 /* Set up sep and seplen */
9417 if (separator == NULL) {
9418 /* fall back to a blank space separator */
9419 sep = PyUnicode_FromOrdinal(' ');
9420 if (!sep)
9421 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009422 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009423 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009424 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009425 else {
9426 if (!PyUnicode_Check(separator)) {
9427 PyErr_Format(PyExc_TypeError,
9428 "separator: expected str instance,"
9429 " %.80s found",
9430 Py_TYPE(separator)->tp_name);
9431 goto onError;
9432 }
9433 if (PyUnicode_READY(separator))
9434 goto onError;
9435 sep = separator;
9436 seplen = PyUnicode_GET_LENGTH(separator);
9437 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9438 /* inc refcount to keep this code path symmetric with the
9439 above case of a blank separator */
9440 Py_INCREF(sep);
9441 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009442 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009443 }
9444
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009445 /* There are at least two things to join, or else we have a subclass
9446 * of str in the sequence.
9447 * Do a pre-pass to figure out the total amount of space we'll
9448 * need (sz), and see whether all argument are strings.
9449 */
9450 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009451#ifdef Py_DEBUG
9452 use_memcpy = 0;
9453#else
9454 use_memcpy = 1;
9455#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009456 for (i = 0; i < seqlen; i++) {
9457 const Py_ssize_t old_sz = sz;
9458 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009459 if (!PyUnicode_Check(item)) {
9460 PyErr_Format(PyExc_TypeError,
9461 "sequence item %zd: expected str instance,"
9462 " %.80s found",
9463 i, Py_TYPE(item)->tp_name);
9464 goto onError;
9465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 if (PyUnicode_READY(item) == -1)
9467 goto onError;
9468 sz += PyUnicode_GET_LENGTH(item);
9469 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009470 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009471 if (i != 0)
9472 sz += seplen;
9473 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9474 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009476 goto onError;
9477 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009478 if (use_memcpy && last_obj != NULL) {
9479 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9480 use_memcpy = 0;
9481 }
9482 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009483 }
Tim Petersced69f82003-09-16 20:30:58 +00009484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009486 if (res == NULL)
9487 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009488
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009489 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009490#ifdef Py_DEBUG
9491 use_memcpy = 0;
9492#else
9493 if (use_memcpy) {
9494 res_data = PyUnicode_1BYTE_DATA(res);
9495 kind = PyUnicode_KIND(res);
9496 if (seplen != 0)
9497 sep_data = PyUnicode_1BYTE_DATA(sep);
9498 }
9499#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009501 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009502 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009504 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009505 if (use_memcpy) {
9506 Py_MEMCPY(res_data,
9507 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009508 kind * seplen);
9509 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009510 }
9511 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009512 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009513 res_offset += seplen;
9514 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009515 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009516 itemlen = PyUnicode_GET_LENGTH(item);
9517 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009518 if (use_memcpy) {
9519 Py_MEMCPY(res_data,
9520 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009521 kind * itemlen);
9522 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009523 }
9524 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009525 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009526 res_offset += itemlen;
9527 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009528 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009529 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009530 if (use_memcpy)
9531 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009532 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009533 else
9534 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009535
Tim Peters05eba1f2004-08-27 21:32:02 +00009536 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009538 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009542 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009544 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545 return NULL;
9546}
9547
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects how the
   buffer is interpreted: unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2
   for PyUnicode_2BYTE_KIND and Py_UCS4 for PyUnicode_4BYTE_KIND;
   PyUnicode_WCHAR_KIND is not accepted.

   This is a statement macro: `kind`, `value` and `length` may be
   evaluated more than once, so pass expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9571
Victor Stinnerd3f08822012-05-29 12:57:52 +02009572void
9573_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9574 Py_UCS4 fill_char)
9575{
9576 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9577 const void *data = PyUnicode_DATA(unicode);
9578 assert(PyUnicode_IS_READY(unicode));
9579 assert(unicode_modifiable(unicode));
9580 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9581 assert(start >= 0);
9582 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9583 FILL(kind, data, fill_char, start, length);
9584}
9585
Victor Stinner3fe55312012-01-04 00:33:50 +01009586Py_ssize_t
9587PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9588 Py_UCS4 fill_char)
9589{
9590 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009591
9592 if (!PyUnicode_Check(unicode)) {
9593 PyErr_BadInternalCall();
9594 return -1;
9595 }
9596 if (PyUnicode_READY(unicode) == -1)
9597 return -1;
9598 if (unicode_check_modifiable(unicode))
9599 return -1;
9600
Victor Stinnerd3f08822012-05-29 12:57:52 +02009601 if (start < 0) {
9602 PyErr_SetString(PyExc_IndexError, "string index out of range");
9603 return -1;
9604 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009605 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9606 PyErr_SetString(PyExc_ValueError,
9607 "fill character is bigger than "
9608 "the string maximum character");
9609 return -1;
9610 }
9611
9612 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9613 length = Py_MIN(maxlen, length);
9614 if (length <= 0)
9615 return 0;
9616
Victor Stinnerd3f08822012-05-29 12:57:52 +02009617 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009618 return length;
9619}
9620
Victor Stinner9310abb2011-10-05 00:59:23 +02009621static PyObject *
9622pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009623 Py_ssize_t left,
9624 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 PyObject *u;
9628 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009629 int kind;
9630 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631
9632 if (left < 0)
9633 left = 0;
9634 if (right < 0)
9635 right = 0;
9636
Victor Stinnerc4b49542011-12-11 22:44:26 +01009637 if (left == 0 && right == 0)
9638 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9641 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009642 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9643 return NULL;
9644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009646 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009648 if (!u)
9649 return NULL;
9650
9651 kind = PyUnicode_KIND(u);
9652 data = PyUnicode_DATA(u);
9653 if (left)
9654 FILL(kind, data, fill, 0, left);
9655 if (right)
9656 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009657 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009658 assert(_PyUnicode_CheckConsistency(u, 1));
9659 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660}
9661
Alexander Belopolsky40018472011-02-26 01:02:56 +00009662PyObject *
9663PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666
9667 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009668 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009669 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009670 if (PyUnicode_READY(string) == -1) {
9671 Py_DECREF(string);
9672 return NULL;
9673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674
Benjamin Petersonead6b532011-12-20 17:23:42 -06009675 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009677 if (PyUnicode_IS_ASCII(string))
9678 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009679 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009680 PyUnicode_GET_LENGTH(string), keepends);
9681 else
9682 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009683 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009684 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 break;
9686 case PyUnicode_2BYTE_KIND:
9687 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009688 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 PyUnicode_GET_LENGTH(string), keepends);
9690 break;
9691 case PyUnicode_4BYTE_KIND:
9692 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009693 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 PyUnicode_GET_LENGTH(string), keepends);
9695 break;
9696 default:
9697 assert(0);
9698 list = 0;
9699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700 Py_DECREF(string);
9701 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702}
9703
Alexander Belopolsky40018472011-02-26 01:02:56 +00009704static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009705split(PyObject *self,
9706 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009707 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 int kind1, kind2, kind;
9710 void *buf1, *buf2;
9711 Py_ssize_t len1, len2;
9712 PyObject* out;
9713
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009715 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 if (PyUnicode_READY(self) == -1)
9718 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009721 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009723 if (PyUnicode_IS_ASCII(self))
9724 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009725 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009726 PyUnicode_GET_LENGTH(self), maxcount
9727 );
9728 else
9729 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009730 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009731 PyUnicode_GET_LENGTH(self), maxcount
9732 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 case PyUnicode_2BYTE_KIND:
9734 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009735 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 PyUnicode_GET_LENGTH(self), maxcount
9737 );
9738 case PyUnicode_4BYTE_KIND:
9739 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009740 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 PyUnicode_GET_LENGTH(self), maxcount
9742 );
9743 default:
9744 assert(0);
9745 return NULL;
9746 }
9747
9748 if (PyUnicode_READY(substring) == -1)
9749 return NULL;
9750
9751 kind1 = PyUnicode_KIND(self);
9752 kind2 = PyUnicode_KIND(substring);
9753 kind = kind1 > kind2 ? kind1 : kind2;
9754 buf1 = PyUnicode_DATA(self);
9755 buf2 = PyUnicode_DATA(substring);
9756 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009757 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 if (!buf1)
9759 return NULL;
9760 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009761 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 if (!buf2) {
9763 if (kind1 != kind) PyMem_Free(buf1);
9764 return NULL;
9765 }
9766 len1 = PyUnicode_GET_LENGTH(self);
9767 len2 = PyUnicode_GET_LENGTH(substring);
9768
Benjamin Petersonead6b532011-12-20 17:23:42 -06009769 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009771 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9772 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009773 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009774 else
9775 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009776 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 break;
9778 case PyUnicode_2BYTE_KIND:
9779 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009780 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 break;
9782 case PyUnicode_4BYTE_KIND:
9783 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009784 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 break;
9786 default:
9787 out = NULL;
9788 }
9789 if (kind1 != kind)
9790 PyMem_Free(buf1);
9791 if (kind2 != kind)
9792 PyMem_Free(buf2);
9793 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794}
9795
Alexander Belopolsky40018472011-02-26 01:02:56 +00009796static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009797rsplit(PyObject *self,
9798 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009799 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 int kind1, kind2, kind;
9802 void *buf1, *buf2;
9803 Py_ssize_t len1, len2;
9804 PyObject* out;
9805
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009806 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009807 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 if (PyUnicode_READY(self) == -1)
9810 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009813 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009815 if (PyUnicode_IS_ASCII(self))
9816 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009817 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009818 PyUnicode_GET_LENGTH(self), maxcount
9819 );
9820 else
9821 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009822 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009823 PyUnicode_GET_LENGTH(self), maxcount
9824 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 case PyUnicode_2BYTE_KIND:
9826 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009827 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 PyUnicode_GET_LENGTH(self), maxcount
9829 );
9830 case PyUnicode_4BYTE_KIND:
9831 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009832 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 PyUnicode_GET_LENGTH(self), maxcount
9834 );
9835 default:
9836 assert(0);
9837 return NULL;
9838 }
9839
9840 if (PyUnicode_READY(substring) == -1)
9841 return NULL;
9842
9843 kind1 = PyUnicode_KIND(self);
9844 kind2 = PyUnicode_KIND(substring);
9845 kind = kind1 > kind2 ? kind1 : kind2;
9846 buf1 = PyUnicode_DATA(self);
9847 buf2 = PyUnicode_DATA(substring);
9848 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009849 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 if (!buf1)
9851 return NULL;
9852 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009853 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 if (!buf2) {
9855 if (kind1 != kind) PyMem_Free(buf1);
9856 return NULL;
9857 }
9858 len1 = PyUnicode_GET_LENGTH(self);
9859 len2 = PyUnicode_GET_LENGTH(substring);
9860
Benjamin Petersonead6b532011-12-20 17:23:42 -06009861 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009863 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9864 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009865 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009866 else
9867 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 break;
9870 case PyUnicode_2BYTE_KIND:
9871 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009872 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 break;
9874 case PyUnicode_4BYTE_KIND:
9875 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009876 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 break;
9878 default:
9879 out = NULL;
9880 }
9881 if (kind1 != kind)
9882 PyMem_Free(buf1);
9883 if (kind2 != kind)
9884 PyMem_Free(buf2);
9885 return out;
9886}
9887
9888static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009889anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9890 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009892 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009894 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9895 return asciilib_find(buf1, len1, buf2, len2, offset);
9896 else
9897 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 case PyUnicode_2BYTE_KIND:
9899 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9900 case PyUnicode_4BYTE_KIND:
9901 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9902 }
9903 assert(0);
9904 return -1;
9905}
9906
9907static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009908anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9909 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009911 switch (kind) {
9912 case PyUnicode_1BYTE_KIND:
9913 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9914 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9915 else
9916 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9917 case PyUnicode_2BYTE_KIND:
9918 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9919 case PyUnicode_4BYTE_KIND:
9920 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9921 }
9922 assert(0);
9923 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009924}
9925
Alexander Belopolsky40018472011-02-26 01:02:56 +00009926static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927replace(PyObject *self, PyObject *str1,
9928 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 PyObject *u;
9931 char *sbuf = PyUnicode_DATA(self);
9932 char *buf1 = PyUnicode_DATA(str1);
9933 char *buf2 = PyUnicode_DATA(str2);
9934 int srelease = 0, release1 = 0, release2 = 0;
9935 int skind = PyUnicode_KIND(self);
9936 int kind1 = PyUnicode_KIND(str1);
9937 int kind2 = PyUnicode_KIND(str2);
9938 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9939 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9940 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009941 int mayshrink;
9942 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943
9944 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009945 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009947 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948
Victor Stinner59de0ee2011-10-07 10:01:28 +02009949 if (str1 == str2)
9950 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 if (skind < kind1)
9952 /* substring too wide to be present */
9953 goto nothing;
9954
Victor Stinner49a0a212011-10-12 23:46:10 +02009955 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9956 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9957 /* Replacing str1 with str2 may cause a maxchar reduction in the
9958 result string. */
9959 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009960 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009963 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009965 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009967 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009968 Py_UCS4 u1, u2;
9969 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009970 Py_ssize_t index, pos;
9971 char *src;
9972
Victor Stinner69ed0f42013-04-09 21:48:24 +02009973 u1 = PyUnicode_READ(kind1, buf1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009974 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9975 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009976 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +02009977 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009979 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009981 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009983
9984 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9985 index = 0;
9986 src = sbuf;
9987 while (--maxcount)
9988 {
9989 pos++;
9990 src += pos * PyUnicode_KIND(self);
9991 slen -= pos;
9992 index += pos;
9993 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9994 if (pos < 0)
9995 break;
9996 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9997 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009998 }
9999 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 int rkind = skind;
10001 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010002 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (kind1 < rkind) {
10005 /* widen substring */
10006 buf1 = _PyUnicode_AsKind(str1, rkind);
10007 if (!buf1) goto error;
10008 release1 = 1;
10009 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010010 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010011 if (i < 0)
10012 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 if (rkind > kind2) {
10014 /* widen replacement */
10015 buf2 = _PyUnicode_AsKind(str2, rkind);
10016 if (!buf2) goto error;
10017 release2 = 1;
10018 }
10019 else if (rkind < kind2) {
10020 /* widen self and buf1 */
10021 rkind = kind2;
10022 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010023 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 sbuf = _PyUnicode_AsKind(self, rkind);
10025 if (!sbuf) goto error;
10026 srelease = 1;
10027 buf1 = _PyUnicode_AsKind(str1, rkind);
10028 if (!buf1) goto error;
10029 release1 = 1;
10030 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010031 u = PyUnicode_New(slen, maxchar);
10032 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010034 assert(PyUnicode_KIND(u) == rkind);
10035 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010036
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010037 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010038 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010039 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010041 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010043
10044 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010045 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010046 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010047 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010048 if (i == -1)
10049 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010050 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010052 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010054 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010056 }
10057 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010059 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 int rkind = skind;
10061 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010064 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 buf1 = _PyUnicode_AsKind(str1, rkind);
10066 if (!buf1) goto error;
10067 release1 = 1;
10068 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010069 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010070 if (n == 0)
10071 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010073 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 buf2 = _PyUnicode_AsKind(str2, rkind);
10075 if (!buf2) goto error;
10076 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010079 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 rkind = kind2;
10081 sbuf = _PyUnicode_AsKind(self, rkind);
10082 if (!sbuf) goto error;
10083 srelease = 1;
10084 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010085 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 buf1 = _PyUnicode_AsKind(str1, rkind);
10087 if (!buf1) goto error;
10088 release1 = 1;
10089 }
10090 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10091 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010092 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 PyErr_SetString(PyExc_OverflowError,
10094 "replace string is too long");
10095 goto error;
10096 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010097 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010098 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010099 _Py_INCREF_UNICODE_EMPTY();
10100 if (!unicode_empty)
10101 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010102 u = unicode_empty;
10103 goto done;
10104 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010105 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 PyErr_SetString(PyExc_OverflowError,
10107 "replace string is too long");
10108 goto error;
10109 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010110 u = PyUnicode_New(new_size, maxchar);
10111 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010113 assert(PyUnicode_KIND(u) == rkind);
10114 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 ires = i = 0;
10116 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010117 while (n-- > 0) {
10118 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010119 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010120 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010121 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010122 if (j == -1)
10123 break;
10124 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010125 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010126 memcpy(res + rkind * ires,
10127 sbuf + rkind * i,
10128 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010130 }
10131 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010133 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010135 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010141 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010142 memcpy(res + rkind * ires,
10143 sbuf + rkind * i,
10144 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010145 }
10146 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010147 /* interleave */
10148 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010149 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010151 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010153 if (--n <= 0)
10154 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010155 memcpy(res + rkind * ires,
10156 sbuf + rkind * i,
10157 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 ires++;
10159 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010160 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010161 memcpy(res + rkind * ires,
10162 sbuf + rkind * i,
10163 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010164 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010165 }
10166
10167 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010168 unicode_adjust_maxchar(&u);
10169 if (u == NULL)
10170 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010172
10173 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 if (srelease)
10175 PyMem_FREE(sbuf);
10176 if (release1)
10177 PyMem_FREE(buf1);
10178 if (release2)
10179 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010180 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010182
Benjamin Peterson29060642009-01-31 22:14:21 +000010183 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010184 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (srelease)
10186 PyMem_FREE(sbuf);
10187 if (release1)
10188 PyMem_FREE(buf1);
10189 if (release2)
10190 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010191 return unicode_result_unchanged(self);
10192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 error:
10194 if (srelease && sbuf)
10195 PyMem_FREE(sbuf);
10196 if (release1 && buf1)
10197 PyMem_FREE(buf1);
10198 if (release2 && buf2)
10199 PyMem_FREE(buf2);
10200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201}
10202
10203/* --- Unicode Object Methods --------------------------------------------- */
10204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010205PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010206 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207\n\
10208Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010209characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
10211static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010212unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010214 if (PyUnicode_READY(self) == -1)
10215 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010216 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217}
10218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010219PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010220 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221\n\
10222Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010223have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
10225static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010226unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010228 if (PyUnicode_READY(self) == -1)
10229 return NULL;
10230 if (PyUnicode_GET_LENGTH(self) == 0)
10231 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010232 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233}
10234
Benjamin Petersond5890c82012-01-14 13:23:30 -050010235PyDoc_STRVAR(casefold__doc__,
10236 "S.casefold() -> str\n\
10237\n\
10238Return a version of S suitable for caseless comparisons.");
10239
10240static PyObject *
10241unicode_casefold(PyObject *self)
10242{
10243 if (PyUnicode_READY(self) == -1)
10244 return NULL;
10245 if (PyUnicode_IS_ASCII(self))
10246 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010247 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010248}
10249
10250
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010251/* Argument converter. Coerces to a single unicode character */
10252
10253static int
10254convert_uc(PyObject *obj, void *addr)
10255{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010258
Benjamin Peterson14339b62009-01-31 16:36:08 +000010259 uniobj = PyUnicode_FromObject(obj);
10260 if (uniobj == NULL) {
10261 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010263 return 0;
10264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010266 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010267 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010268 Py_DECREF(uniobj);
10269 return 0;
10270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010272 Py_DECREF(uniobj);
10273 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010274}
10275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010276PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010277 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010279Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010280done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281
10282static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010283unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010285 Py_ssize_t marg, left;
10286 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 Py_UCS4 fillchar = ' ';
10288
Victor Stinnere9a29352011-10-01 02:14:59 +020010289 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291
Benjamin Petersonbac79492012-01-14 13:34:47 -050010292 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293 return NULL;
10294
Victor Stinnerc4b49542011-12-11 22:44:26 +010010295 if (PyUnicode_GET_LENGTH(self) >= width)
10296 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297
Victor Stinnerc4b49542011-12-11 22:44:26 +010010298 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299 left = marg / 2 + (marg & width & 1);
10300
Victor Stinner9310abb2011-10-05 00:59:23 +020010301 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302}
10303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304/* This function assumes that str1 and str2 are readied by the caller. */
10305
Marc-André Lemburge5034372000-08-08 08:04:29 +000010306static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010307unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010308{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010309#define COMPARE(TYPE1, TYPE2) \
10310 do { \
10311 TYPE1* p1 = (TYPE1 *)data1; \
10312 TYPE2* p2 = (TYPE2 *)data2; \
10313 TYPE1* end = p1 + len; \
10314 Py_UCS4 c1, c2; \
10315 for (; p1 != end; p1++, p2++) { \
10316 c1 = *p1; \
10317 c2 = *p2; \
10318 if (c1 != c2) \
10319 return (c1 < c2) ? -1 : 1; \
10320 } \
10321 } \
10322 while (0)
10323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 int kind1, kind2;
10325 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010326 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010327
Victor Stinner90db9c42012-10-04 21:53:50 +020010328 /* a string is equal to itself */
10329 if (str1 == str2)
10330 return 0;
10331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 kind1 = PyUnicode_KIND(str1);
10333 kind2 = PyUnicode_KIND(str2);
10334 data1 = PyUnicode_DATA(str1);
10335 data2 = PyUnicode_DATA(str2);
10336 len1 = PyUnicode_GET_LENGTH(str1);
10337 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010338 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010339
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010340 switch(kind1) {
10341 case PyUnicode_1BYTE_KIND:
10342 {
10343 switch(kind2) {
10344 case PyUnicode_1BYTE_KIND:
10345 {
10346 int cmp = memcmp(data1, data2, len);
10347 /* normalize result of memcmp() into the range [-1; 1] */
10348 if (cmp < 0)
10349 return -1;
10350 if (cmp > 0)
10351 return 1;
10352 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010353 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010354 case PyUnicode_2BYTE_KIND:
10355 COMPARE(Py_UCS1, Py_UCS2);
10356 break;
10357 case PyUnicode_4BYTE_KIND:
10358 COMPARE(Py_UCS1, Py_UCS4);
10359 break;
10360 default:
10361 assert(0);
10362 }
10363 break;
10364 }
10365 case PyUnicode_2BYTE_KIND:
10366 {
10367 switch(kind2) {
10368 case PyUnicode_1BYTE_KIND:
10369 COMPARE(Py_UCS2, Py_UCS1);
10370 break;
10371 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010372 {
10373#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 2
10374 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10375 /* normalize result of wmemcmp() into the range [-1; 1] */
10376 if (cmp < 0)
10377 return -1;
10378 if (cmp > 0)
10379 return 1;
10380#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010381 COMPARE(Py_UCS2, Py_UCS2);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010382#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010383 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010384 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010385 case PyUnicode_4BYTE_KIND:
10386 COMPARE(Py_UCS2, Py_UCS4);
10387 break;
10388 default:
10389 assert(0);
10390 }
10391 break;
10392 }
10393 case PyUnicode_4BYTE_KIND:
10394 {
10395 switch(kind2) {
10396 case PyUnicode_1BYTE_KIND:
10397 COMPARE(Py_UCS4, Py_UCS1);
10398 break;
10399 case PyUnicode_2BYTE_KIND:
10400 COMPARE(Py_UCS4, Py_UCS2);
10401 break;
10402 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010403 {
10404#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10405 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10406 /* normalize result of wmemcmp() into the range [-1; 1] */
10407 if (cmp < 0)
10408 return -1;
10409 if (cmp > 0)
10410 return 1;
10411#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010412 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010413#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010414 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010415 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010416 default:
10417 assert(0);
10418 }
10419 break;
10420 }
10421 default:
10422 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010423 }
10424
Victor Stinner770e19e2012-10-04 22:59:45 +020010425 if (len1 == len2)
10426 return 0;
10427 if (len1 < len2)
10428 return -1;
10429 else
10430 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010431
10432#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010433}
10434
Victor Stinnere5567ad2012-10-23 02:48:49 +020010435static int
10436unicode_compare_eq(PyObject *str1, PyObject *str2)
10437{
10438 int kind;
10439 void *data1, *data2;
10440 Py_ssize_t len;
10441 int cmp;
10442
10443 /* a string is equal to itself */
10444 if (str1 == str2)
10445 return 1;
10446
10447 len = PyUnicode_GET_LENGTH(str1);
10448 if (PyUnicode_GET_LENGTH(str2) != len)
10449 return 0;
10450 kind = PyUnicode_KIND(str1);
10451 if (PyUnicode_KIND(str2) != kind)
10452 return 0;
10453 data1 = PyUnicode_DATA(str1);
10454 data2 = PyUnicode_DATA(str2);
10455
10456 cmp = memcmp(data1, data2, len * kind);
10457 return (cmp == 0);
10458}
10459
10460
Alexander Belopolsky40018472011-02-26 01:02:56 +000010461int
10462PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10465 if (PyUnicode_READY(left) == -1 ||
10466 PyUnicode_READY(right) == -1)
10467 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010468 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010470 PyErr_Format(PyExc_TypeError,
10471 "Can't compare %.100s and %.100s",
10472 left->ob_type->tp_name,
10473 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010474 return -1;
10475}
10476
Martin v. Löwis5b222132007-06-10 09:51:05 +000010477int
10478PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 Py_ssize_t i;
10481 int kind;
10482 void *data;
10483 Py_UCS4 chr;
10484
Victor Stinner910337b2011-10-03 03:20:16 +020010485 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (PyUnicode_READY(uni) == -1)
10487 return -1;
10488 kind = PyUnicode_KIND(uni);
10489 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010490 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10492 if (chr != str[i])
10493 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010494 /* This check keeps Python strings that end in '\0' from comparing equal
10495 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010497 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010498 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010500 return 0;
10501}
10502
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010503
Benjamin Peterson29060642009-01-31 22:14:21 +000010504#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010505 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010506
Alexander Belopolsky40018472011-02-26 01:02:56 +000010507PyObject *
10508PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010509{
10510 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010511 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010512
Victor Stinnere5567ad2012-10-23 02:48:49 +020010513 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10514 Py_RETURN_NOTIMPLEMENTED;
10515
10516 if (PyUnicode_READY(left) == -1 ||
10517 PyUnicode_READY(right) == -1)
10518 return NULL;
10519
10520 if (op == Py_EQ || op == Py_NE) {
10521 result = unicode_compare_eq(left, right);
10522 if (op == Py_EQ)
10523 v = TEST_COND(result);
10524 else
10525 v = TEST_COND(!result);
10526 }
10527 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010528 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010529
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010530 /* Convert the return value to a Boolean */
10531 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010532 case Py_LE:
10533 v = TEST_COND(result <= 0);
10534 break;
10535 case Py_GE:
10536 v = TEST_COND(result >= 0);
10537 break;
10538 case Py_LT:
10539 v = TEST_COND(result == -1);
10540 break;
10541 case Py_GT:
10542 v = TEST_COND(result == 1);
10543 break;
10544 default:
10545 PyErr_BadArgument();
10546 return NULL;
10547 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010548 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010549 Py_INCREF(v);
10550 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010551}
10552
Alexander Belopolsky40018472011-02-26 01:02:56 +000010553int
10554PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010555{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010556 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 int kind1, kind2, kind;
10558 void *buf1, *buf2;
10559 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010560 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010561
10562 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010563 sub = PyUnicode_FromObject(element);
10564 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010565 PyErr_Format(PyExc_TypeError,
10566 "'in <string>' requires string as left operand, not %s",
10567 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010568 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010569 }
10570
Thomas Wouters477c8d52006-05-27 19:21:47 +000010571 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010572 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010573 Py_DECREF(sub);
10574 return -1;
10575 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010576 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10577 Py_DECREF(sub);
10578 Py_DECREF(str);
10579 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 kind1 = PyUnicode_KIND(str);
10582 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010583 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 buf1 = PyUnicode_DATA(str);
10585 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010586 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010587 if (kind2 > kind) {
10588 Py_DECREF(sub);
10589 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010590 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010591 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010592 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (!buf2) {
10595 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010596 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 return -1;
10598 }
10599 len1 = PyUnicode_GET_LENGTH(str);
10600 len2 = PyUnicode_GET_LENGTH(sub);
10601
Benjamin Petersonead6b532011-12-20 17:23:42 -060010602 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 case PyUnicode_1BYTE_KIND:
10604 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10605 break;
10606 case PyUnicode_2BYTE_KIND:
10607 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10608 break;
10609 case PyUnicode_4BYTE_KIND:
10610 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10611 break;
10612 default:
10613 result = -1;
10614 assert(0);
10615 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010616
10617 Py_DECREF(str);
10618 Py_DECREF(sub);
10619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 if (kind2 != kind)
10621 PyMem_Free(buf2);
10622
Guido van Rossum403d68b2000-03-13 15:55:09 +000010623 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010624}
10625
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626/* Concat to string or Unicode object giving a new Unicode object. */
10627
Alexander Belopolsky40018472011-02-26 01:02:56 +000010628PyObject *
10629PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010632 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010633 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634
10635 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010638 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010641 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642
10643 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010644 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010645 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010648 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010649 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651 }
10652
Victor Stinner488fa492011-12-12 00:01:39 +010010653 u_len = PyUnicode_GET_LENGTH(u);
10654 v_len = PyUnicode_GET_LENGTH(v);
10655 if (u_len > PY_SSIZE_T_MAX - v_len) {
10656 PyErr_SetString(PyExc_OverflowError,
10657 "strings are too large to concat");
10658 goto onError;
10659 }
10660 new_len = u_len + v_len;
10661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010663 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010664 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010667 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010670 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10671 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672 Py_DECREF(u);
10673 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010674 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676
Benjamin Peterson29060642009-01-31 22:14:21 +000010677 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 Py_XDECREF(u);
10679 Py_XDECREF(v);
10680 return NULL;
10681}
10682
Walter Dörwald1ab83302007-05-18 17:15:44 +000010683void
Victor Stinner23e56682011-10-03 03:54:37 +020010684PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010685{
Victor Stinner23e56682011-10-03 03:54:37 +020010686 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010687 Py_UCS4 maxchar, maxchar2;
10688 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010689
10690 if (p_left == NULL) {
10691 if (!PyErr_Occurred())
10692 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010693 return;
10694 }
Victor Stinner23e56682011-10-03 03:54:37 +020010695 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010696 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010697 if (!PyErr_Occurred())
10698 PyErr_BadInternalCall();
10699 goto error;
10700 }
10701
Benjamin Petersonbac79492012-01-14 13:34:47 -050010702 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010703 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010704 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010705 goto error;
10706
Victor Stinner488fa492011-12-12 00:01:39 +010010707 /* Shortcuts */
10708 if (left == unicode_empty) {
10709 Py_DECREF(left);
10710 Py_INCREF(right);
10711 *p_left = right;
10712 return;
10713 }
10714 if (right == unicode_empty)
10715 return;
10716
10717 left_len = PyUnicode_GET_LENGTH(left);
10718 right_len = PyUnicode_GET_LENGTH(right);
10719 if (left_len > PY_SSIZE_T_MAX - right_len) {
10720 PyErr_SetString(PyExc_OverflowError,
10721 "strings are too large to concat");
10722 goto error;
10723 }
10724 new_len = left_len + right_len;
10725
10726 if (unicode_modifiable(left)
10727 && PyUnicode_CheckExact(right)
10728 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010729 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10730 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010731 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010732 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010733 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10734 {
10735 /* append inplace */
10736 if (unicode_resize(p_left, new_len) != 0) {
10737 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10738 * deallocated so it cannot be put back into
10739 * 'variable'. The MemoryError is raised when there
10740 * is no value in 'variable', which might (very
10741 * remotely) be a cause of incompatibilities.
10742 */
10743 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010744 }
Victor Stinner488fa492011-12-12 00:01:39 +010010745 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010746 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010747 }
Victor Stinner488fa492011-12-12 00:01:39 +010010748 else {
10749 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10750 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010751 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010752
Victor Stinner488fa492011-12-12 00:01:39 +010010753 /* Concat the two Unicode strings */
10754 res = PyUnicode_New(new_len, maxchar);
10755 if (res == NULL)
10756 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010757 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10758 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010759 Py_DECREF(left);
10760 *p_left = res;
10761 }
10762 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010763 return;
10764
10765error:
Victor Stinner488fa492011-12-12 00:01:39 +010010766 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010767}
10768
10769void
10770PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10771{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010772 PyUnicode_Append(pleft, right);
10773 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010774}
10775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010776PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010777 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010779Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010780string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010781interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782
10783static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010784unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010786 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010787 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010788 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 int kind1, kind2, kind;
10791 void *buf1, *buf2;
10792 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793
Jesus Ceaac451502011-04-20 17:09:23 +020010794 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10795 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 kind1 = PyUnicode_KIND(self);
10799 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010800 if (kind2 > kind1)
10801 return PyLong_FromLong(0);
10802 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 buf1 = PyUnicode_DATA(self);
10804 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010806 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 if (!buf2) {
10808 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 return NULL;
10810 }
10811 len1 = PyUnicode_GET_LENGTH(self);
10812 len2 = PyUnicode_GET_LENGTH(substring);
10813
10814 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010815 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 case PyUnicode_1BYTE_KIND:
10817 iresult = ucs1lib_count(
10818 ((Py_UCS1*)buf1) + start, end - start,
10819 buf2, len2, PY_SSIZE_T_MAX
10820 );
10821 break;
10822 case PyUnicode_2BYTE_KIND:
10823 iresult = ucs2lib_count(
10824 ((Py_UCS2*)buf1) + start, end - start,
10825 buf2, len2, PY_SSIZE_T_MAX
10826 );
10827 break;
10828 case PyUnicode_4BYTE_KIND:
10829 iresult = ucs4lib_count(
10830 ((Py_UCS4*)buf1) + start, end - start,
10831 buf2, len2, PY_SSIZE_T_MAX
10832 );
10833 break;
10834 default:
10835 assert(0); iresult = 0;
10836 }
10837
10838 result = PyLong_FromSsize_t(iresult);
10839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 if (kind2 != kind)
10841 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
10843 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010844
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845 return result;
10846}
10847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010848PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010849 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010851Encode S using the codec registered for encoding. Default encoding\n\
10852is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010853handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010854a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10855'xmlcharrefreplace' as well as any other name registered with\n\
10856codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857
10858static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010859unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010861 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862 char *encoding = NULL;
10863 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010864
Benjamin Peterson308d6372009-09-18 21:42:35 +000010865 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10866 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010868 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010869}
10870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010871PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010872 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873\n\
10874Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010875If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
10877static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010878unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010880 Py_ssize_t i, j, line_pos, src_len, incr;
10881 Py_UCS4 ch;
10882 PyObject *u;
10883 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010885 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010886 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887
10888 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
Antoine Pitrou22425222011-10-04 19:10:51 +020010891 if (PyUnicode_READY(self) == -1)
10892 return NULL;
10893
Thomas Wouters7e474022000-07-16 12:04:32 +000010894 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 src_len = PyUnicode_GET_LENGTH(self);
10896 i = j = line_pos = 0;
10897 kind = PyUnicode_KIND(self);
10898 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010899 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010900 for (; i < src_len; i++) {
10901 ch = PyUnicode_READ(kind, src_data, i);
10902 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010903 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010904 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010907 goto overflow;
10908 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010910 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010914 goto overflow;
10915 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010917 if (ch == '\n' || ch == '\r')
10918 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010921 if (!found)
10922 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010923
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010925 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926 if (!u)
10927 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010928 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929
Antoine Pitroue71d5742011-10-04 15:55:09 +020010930 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
Antoine Pitroue71d5742011-10-04 15:55:09 +020010932 for (; i < src_len; i++) {
10933 ch = PyUnicode_READ(kind, src_data, i);
10934 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010935 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010936 incr = tabsize - (line_pos % tabsize);
10937 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010938 FILL(kind, dest_data, ' ', j, incr);
10939 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010941 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010943 line_pos++;
10944 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010945 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010946 if (ch == '\n' || ch == '\r')
10947 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010949 }
10950 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010951 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010952
Antoine Pitroue71d5742011-10-04 15:55:09 +020010953 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010954 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956}
10957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010958PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010959 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960\n\
10961Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010962such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963arguments start and end are interpreted as in slice notation.\n\
10964\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010965Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
10967static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010970 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010971 Py_ssize_t start;
10972 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010973 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974
Jesus Ceaac451502011-04-20 17:09:23 +020010975 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10976 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 if (PyUnicode_READY(self) == -1)
10980 return NULL;
10981 if (PyUnicode_READY(substring) == -1)
10982 return NULL;
10983
Victor Stinner7931d9a2011-11-04 00:22:48 +010010984 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985
10986 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 if (result == -2)
10989 return NULL;
10990
Christian Heimes217cfd12007-12-02 14:31:20 +000010991 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992}
10993
10994static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010995unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010997 void *data;
10998 enum PyUnicode_Kind kind;
10999 Py_UCS4 ch;
11000 PyObject *res;
11001
11002 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11003 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011005 }
11006 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11007 PyErr_SetString(PyExc_IndexError, "string index out of range");
11008 return NULL;
11009 }
11010 kind = PyUnicode_KIND(self);
11011 data = PyUnicode_DATA(self);
11012 ch = PyUnicode_READ(kind, data, index);
11013 if (ch < 256)
11014 return get_latin1_char(ch);
11015
11016 res = PyUnicode_New(1, ch);
11017 if (res == NULL)
11018 return NULL;
11019 kind = PyUnicode_KIND(res);
11020 data = PyUnicode_DATA(res);
11021 PyUnicode_WRITE(kind, data, 0, ch);
11022 assert(_PyUnicode_CheckConsistency(res, 1));
11023 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024}
11025
Guido van Rossumc2504932007-09-18 19:42:40 +000011026/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011027 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011028static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011029unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030{
Guido van Rossumc2504932007-09-18 19:42:40 +000011031 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011032 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011033
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011034#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011035 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011036#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 if (_PyUnicode_HASH(self) != -1)
11038 return _PyUnicode_HASH(self);
11039 if (PyUnicode_READY(self) == -1)
11040 return -1;
11041 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011042 /*
11043 We make the hash of the empty string be 0, rather than using
11044 (prefix ^ suffix), since this slightly obfuscates the hash secret
11045 */
11046 if (len == 0) {
11047 _PyUnicode_HASH(self) = 0;
11048 return 0;
11049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050
11051 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011052#define HASH(P) \
11053 x ^= (Py_uhash_t) *P << 7; \
11054 while (--len >= 0) \
11055 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011056
Georg Brandl2fb477c2012-02-21 00:33:36 +010011057 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011058 switch (PyUnicode_KIND(self)) {
11059 case PyUnicode_1BYTE_KIND: {
11060 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11061 HASH(c);
11062 break;
11063 }
11064 case PyUnicode_2BYTE_KIND: {
11065 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11066 HASH(s);
11067 break;
11068 }
11069 default: {
11070 Py_UCS4 *l;
11071 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11072 "Impossible switch case in unicode_hash");
11073 l = PyUnicode_4BYTE_DATA(self);
11074 HASH(l);
11075 break;
11076 }
11077 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011078 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11079 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080
Guido van Rossumc2504932007-09-18 19:42:40 +000011081 if (x == -1)
11082 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011084 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011088PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011089 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011091Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092
11093static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011096 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011097 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011098 Py_ssize_t start;
11099 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100
Jesus Ceaac451502011-04-20 17:09:23 +020011101 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11102 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 if (PyUnicode_READY(self) == -1)
11106 return NULL;
11107 if (PyUnicode_READY(substring) == -1)
11108 return NULL;
11109
Victor Stinner7931d9a2011-11-04 00:22:48 +010011110 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111
11112 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 if (result == -2)
11115 return NULL;
11116
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117 if (result < 0) {
11118 PyErr_SetString(PyExc_ValueError, "substring not found");
11119 return NULL;
11120 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011121
Christian Heimes217cfd12007-12-02 14:31:20 +000011122 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123}
11124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011125PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011126 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011128Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011129at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130
11131static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011132unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 Py_ssize_t i, length;
11135 int kind;
11136 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 int cased;
11138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 if (PyUnicode_READY(self) == -1)
11140 return NULL;
11141 length = PyUnicode_GET_LENGTH(self);
11142 kind = PyUnicode_KIND(self);
11143 data = PyUnicode_DATA(self);
11144
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 if (length == 1)
11147 return PyBool_FromLong(
11148 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011150 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011153
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 for (i = 0; i < length; i++) {
11156 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011157
Benjamin Peterson29060642009-01-31 22:14:21 +000011158 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11159 return PyBool_FromLong(0);
11160 else if (!cased && Py_UNICODE_ISLOWER(ch))
11161 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011163 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164}
11165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011166PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011167 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011169Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011170at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
11172static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011173unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 Py_ssize_t i, length;
11176 int kind;
11177 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178 int cased;
11179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (PyUnicode_READY(self) == -1)
11181 return NULL;
11182 length = PyUnicode_GET_LENGTH(self);
11183 kind = PyUnicode_KIND(self);
11184 data = PyUnicode_DATA(self);
11185
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 if (length == 1)
11188 return PyBool_FromLong(
11189 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011191 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011193 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 for (i = 0; i < length; i++) {
11197 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011198
Benjamin Peterson29060642009-01-31 22:14:21 +000011199 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11200 return PyBool_FromLong(0);
11201 else if (!cased && Py_UNICODE_ISUPPER(ch))
11202 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011204 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205}
11206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011207PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011208 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011210Return True if S is a titlecased string and there is at least one\n\
11211character in S, i.e. upper- and titlecase characters may only\n\
11212follow uncased characters and lowercase characters only cased ones.\n\
11213Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214
11215static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011216unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 Py_ssize_t i, length;
11219 int kind;
11220 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221 int cased, previous_is_cased;
11222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011223 if (PyUnicode_READY(self) == -1)
11224 return NULL;
11225 length = PyUnicode_GET_LENGTH(self);
11226 kind = PyUnicode_KIND(self);
11227 data = PyUnicode_DATA(self);
11228
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 if (length == 1) {
11231 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11232 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11233 (Py_UNICODE_ISUPPER(ch) != 0));
11234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011236 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011239
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 cased = 0;
11241 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 for (i = 0; i < length; i++) {
11243 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011244
Benjamin Peterson29060642009-01-31 22:14:21 +000011245 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11246 if (previous_is_cased)
11247 return PyBool_FromLong(0);
11248 previous_is_cased = 1;
11249 cased = 1;
11250 }
11251 else if (Py_UNICODE_ISLOWER(ch)) {
11252 if (!previous_is_cased)
11253 return PyBool_FromLong(0);
11254 previous_is_cased = 1;
11255 cased = 1;
11256 }
11257 else
11258 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011260 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261}
11262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011263PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011266Return True if all characters in S are whitespace\n\
11267and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268
11269static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011270unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 Py_ssize_t i, length;
11273 int kind;
11274 void *data;
11275
11276 if (PyUnicode_READY(self) == -1)
11277 return NULL;
11278 length = PyUnicode_GET_LENGTH(self);
11279 kind = PyUnicode_KIND(self);
11280 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (length == 1)
11284 return PyBool_FromLong(
11285 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011287 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011289 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 for (i = 0; i < length; i++) {
11292 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011293 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011296 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297}
11298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011299PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011301\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011302Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011303and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011304
11305static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011306unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011307{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 Py_ssize_t i, length;
11309 int kind;
11310 void *data;
11311
11312 if (PyUnicode_READY(self) == -1)
11313 return NULL;
11314 length = PyUnicode_GET_LENGTH(self);
11315 kind = PyUnicode_KIND(self);
11316 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011317
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011318 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (length == 1)
11320 return PyBool_FromLong(
11321 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011322
11323 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011325 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 for (i = 0; i < length; i++) {
11328 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011329 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011331 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011332}
11333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011334PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011336\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011337Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011338and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011339
11340static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011341unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 int kind;
11344 void *data;
11345 Py_ssize_t len, i;
11346
11347 if (PyUnicode_READY(self) == -1)
11348 return NULL;
11349
11350 kind = PyUnicode_KIND(self);
11351 data = PyUnicode_DATA(self);
11352 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011353
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011354 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 if (len == 1) {
11356 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11357 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11358 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011359
11360 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 for (i = 0; i < len; i++) {
11365 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011366 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011368 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011369 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011370}
11371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011372PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011373 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011375Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011376False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377
11378static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011379unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 Py_ssize_t i, length;
11382 int kind;
11383 void *data;
11384
11385 if (PyUnicode_READY(self) == -1)
11386 return NULL;
11387 length = PyUnicode_GET_LENGTH(self);
11388 kind = PyUnicode_KIND(self);
11389 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 if (length == 1)
11393 return PyBool_FromLong(
11394 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011396 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011398 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 for (i = 0; i < length; i++) {
11401 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011404 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405}
11406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011410Return True if all characters in S are digits\n\
11411and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
11413static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011414unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 Py_ssize_t i, length;
11417 int kind;
11418 void *data;
11419
11420 if (PyUnicode_READY(self) == -1)
11421 return NULL;
11422 length = PyUnicode_GET_LENGTH(self);
11423 kind = PyUnicode_KIND(self);
11424 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (length == 1) {
11428 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11429 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011432 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 for (i = 0; i < length; i++) {
11437 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011440 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441}
11442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011446Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011447False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
11449static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011450unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 Py_ssize_t i, length;
11453 int kind;
11454 void *data;
11455
11456 if (PyUnicode_READY(self) == -1)
11457 return NULL;
11458 length = PyUnicode_GET_LENGTH(self);
11459 kind = PyUnicode_KIND(self);
11460 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 1)
11464 return PyBool_FromLong(
11465 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011467 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 for (i = 0; i < length; i++) {
11472 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011475 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476}
11477
Martin v. Löwis47383402007-08-15 07:32:56 +000011478int
11479PyUnicode_IsIdentifier(PyObject *self)
11480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 int kind;
11482 void *data;
11483 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011484 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 if (PyUnicode_READY(self) == -1) {
11487 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 }
11490
11491 /* Special case for empty strings */
11492 if (PyUnicode_GET_LENGTH(self) == 0)
11493 return 0;
11494 kind = PyUnicode_KIND(self);
11495 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011496
11497 /* PEP 3131 says that the first character must be in
11498 XID_Start and subsequent characters in XID_Continue,
11499 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011500 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011501 letters, digits, underscore). However, given the current
11502 definition of XID_Start and XID_Continue, it is sufficient
11503 to check just for these, except that _ must be allowed
11504 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011506 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011507 return 0;
11508
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011509 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011512 return 1;
11513}
11514
11515PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011517\n\
11518Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011519to the language definition.\n\
11520\n\
11521Use keyword.iskeyword() to test for reserved identifiers\n\
11522such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011523
11524static PyObject*
11525unicode_isidentifier(PyObject *self)
11526{
11527 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11528}
11529
Georg Brandl559e5d72008-06-11 18:37:52 +000011530PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011532\n\
11533Return True if all characters in S are considered\n\
11534printable in repr() or S is empty, False otherwise.");
11535
11536static PyObject*
11537unicode_isprintable(PyObject *self)
11538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 Py_ssize_t i, length;
11540 int kind;
11541 void *data;
11542
11543 if (PyUnicode_READY(self) == -1)
11544 return NULL;
11545 length = PyUnicode_GET_LENGTH(self);
11546 kind = PyUnicode_KIND(self);
11547 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011548
11549 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 if (length == 1)
11551 return PyBool_FromLong(
11552 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 for (i = 0; i < length; i++) {
11555 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011556 Py_RETURN_FALSE;
11557 }
11558 }
11559 Py_RETURN_TRUE;
11560}
11561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011562PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011563 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564\n\
11565Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011566iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
11568static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011569unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011571 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572}
11573
Martin v. Löwis18e16552006-02-15 17:27:45 +000011574static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011575unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 if (PyUnicode_READY(self) == -1)
11578 return -1;
11579 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580}
11581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011582PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011585Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011586done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
11588static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011589unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011591 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 Py_UCS4 fillchar = ' ';
11593
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011594 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595 return NULL;
11596
Benjamin Petersonbac79492012-01-14 13:34:47 -050011597 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599
Victor Stinnerc4b49542011-12-11 22:44:26 +010011600 if (PyUnicode_GET_LENGTH(self) >= width)
11601 return unicode_result_unchanged(self);
11602
11603 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604}
11605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011606PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
11611static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011612unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011614 if (PyUnicode_READY(self) == -1)
11615 return NULL;
11616 if (PyUnicode_IS_ASCII(self))
11617 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011618 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619}
11620
/* Strip modes.  The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode, indexed by the
   constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its three leading
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11629
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011630/* externally visible for str.strip(unicode) */
11631PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011632_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 void *data;
11635 int kind;
11636 Py_ssize_t i, j, len;
11637 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11640 return NULL;
11641
11642 kind = PyUnicode_KIND(self);
11643 data = PyUnicode_DATA(self);
11644 len = PyUnicode_GET_LENGTH(self);
11645 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11646 PyUnicode_DATA(sepobj),
11647 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011648
Benjamin Peterson14339b62009-01-31 16:36:08 +000011649 i = 0;
11650 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 while (i < len &&
11652 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 i++;
11654 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011655 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011656
Benjamin Peterson14339b62009-01-31 16:36:08 +000011657 j = len;
11658 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 do {
11660 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 } while (j >= i &&
11662 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011664 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011665
Victor Stinner7931d9a2011-11-04 00:22:48 +010011666 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667}
11668
11669PyObject*
11670PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11671{
11672 unsigned char *data;
11673 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011674 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675
Victor Stinnerde636f32011-10-01 03:55:54 +020011676 if (PyUnicode_READY(self) == -1)
11677 return NULL;
11678
Victor Stinner684d5fd2012-05-03 02:32:34 +020011679 length = PyUnicode_GET_LENGTH(self);
11680 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011681
Victor Stinner684d5fd2012-05-03 02:32:34 +020011682 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011683 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684
Victor Stinnerde636f32011-10-01 03:55:54 +020011685 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011686 PyErr_SetString(PyExc_IndexError, "string index out of range");
11687 return NULL;
11688 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011689 if (start >= length || end < start)
11690 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011691
Victor Stinner684d5fd2012-05-03 02:32:34 +020011692 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011693 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011694 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011695 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011696 }
11697 else {
11698 kind = PyUnicode_KIND(self);
11699 data = PyUnicode_1BYTE_DATA(self);
11700 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011701 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011702 length);
11703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705
11706static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011707do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 int kind;
11710 void *data;
11711 Py_ssize_t len, i, j;
11712
11713 if (PyUnicode_READY(self) == -1)
11714 return NULL;
11715
11716 kind = PyUnicode_KIND(self);
11717 data = PyUnicode_DATA(self);
11718 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011719
Benjamin Peterson14339b62009-01-31 16:36:08 +000011720 i = 0;
11721 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011723 i++;
11724 }
11725 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011726
Benjamin Peterson14339b62009-01-31 16:36:08 +000011727 j = len;
11728 if (striptype != LEFTSTRIP) {
11729 do {
11730 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011732 j++;
11733 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734
Victor Stinner7931d9a2011-11-04 00:22:48 +010011735 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736}
11737
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011738
11739static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011740do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011741{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011742 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011743
Benjamin Peterson14339b62009-01-31 16:36:08 +000011744 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11745 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011746
Benjamin Peterson14339b62009-01-31 16:36:08 +000011747 if (sep != NULL && sep != Py_None) {
11748 if (PyUnicode_Check(sep))
11749 return _PyUnicode_XStrip(self, striptype, sep);
11750 else {
11751 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 "%s arg must be None or str",
11753 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011754 return NULL;
11755 }
11756 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757
Benjamin Peterson14339b62009-01-31 16:36:08 +000011758 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011759}
11760
11761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011764\n\
11765Return a copy of the string S with leading and trailing\n\
11766whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011767If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011768
11769static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011770unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011771{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011772 if (PyTuple_GET_SIZE(args) == 0)
11773 return do_strip(self, BOTHSTRIP); /* Common case */
11774 else
11775 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011776}
11777
11778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011779PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011781\n\
11782Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011783If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011784
11785static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011786unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 if (PyTuple_GET_SIZE(args) == 0)
11789 return do_strip(self, LEFTSTRIP); /* Common case */
11790 else
11791 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011792}
11793
11794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011795PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011797\n\
11798Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011799If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011800
11801static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011802unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011803{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011804 if (PyTuple_GET_SIZE(args) == 0)
11805 return do_strip(self, RIGHTSTRIP); /* Common case */
11806 else
11807 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011808}
11809
11810
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011812unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011814 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816
Serhiy Storchaka05997252013-01-26 12:14:02 +020011817 if (len < 1)
11818 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
Victor Stinnerc4b49542011-12-11 22:44:26 +010011820 /* no repeat, return original string */
11821 if (len == 1)
11822 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011823
Benjamin Petersonbac79492012-01-14 13:34:47 -050011824 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 return NULL;
11826
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011827 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011828 PyErr_SetString(PyExc_OverflowError,
11829 "repeated string is too long");
11830 return NULL;
11831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011833
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011834 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 if (!u)
11836 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011837 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 if (PyUnicode_GET_LENGTH(str) == 1) {
11840 const int kind = PyUnicode_KIND(str);
11841 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011842 if (kind == PyUnicode_1BYTE_KIND) {
11843 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011844 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011845 }
11846 else if (kind == PyUnicode_2BYTE_KIND) {
11847 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011848 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011849 ucs2[n] = fill_char;
11850 } else {
11851 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11852 assert(kind == PyUnicode_4BYTE_KIND);
11853 for (n = 0; n < len; ++n)
11854 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 }
11857 else {
11858 /* number of characters copied this far */
11859 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011860 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 char *to = (char *) PyUnicode_DATA(u);
11862 Py_MEMCPY(to, PyUnicode_DATA(str),
11863 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 n = (done <= nchars-done) ? done : nchars-done;
11866 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011867 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869 }
11870
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011871 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011872 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873}
11874
Alexander Belopolsky40018472011-02-26 01:02:56 +000011875PyObject *
11876PyUnicode_Replace(PyObject *obj,
11877 PyObject *subobj,
11878 PyObject *replobj,
11879 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880{
11881 PyObject *self;
11882 PyObject *str1;
11883 PyObject *str2;
11884 PyObject *result;
11885
11886 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011887 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011890 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 Py_DECREF(self);
11892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893 }
11894 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011895 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 Py_DECREF(self);
11897 Py_DECREF(str1);
11898 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011900 if (PyUnicode_READY(self) == -1 ||
11901 PyUnicode_READY(str1) == -1 ||
11902 PyUnicode_READY(str2) == -1)
11903 result = NULL;
11904 else
11905 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906 Py_DECREF(self);
11907 Py_DECREF(str1);
11908 Py_DECREF(str2);
11909 return result;
11910}
11911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011912PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011913 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914\n\
11915Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011916old replaced by new. If the optional argument count is\n\
11917given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918
11919static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 PyObject *str1;
11923 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011924 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 PyObject *result;
11926
Martin v. Löwis18e16552006-02-15 17:27:45 +000011927 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011929 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011932 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 return NULL;
11934 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011935 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 Py_DECREF(str1);
11937 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011938 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011939 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11940 result = NULL;
11941 else
11942 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943
11944 Py_DECREF(str1);
11945 Py_DECREF(str2);
11946 return result;
11947}
11948
Alexander Belopolsky40018472011-02-26 01:02:56 +000011949static PyObject *
11950unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011952 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 Py_ssize_t isize;
11954 Py_ssize_t osize, squote, dquote, i, o;
11955 Py_UCS4 max, quote;
11956 int ikind, okind;
11957 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011960 return NULL;
11961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 isize = PyUnicode_GET_LENGTH(unicode);
11963 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 /* Compute length of output, quote characters, and
11966 maximum character */
11967 osize = 2; /* quotes */
11968 max = 127;
11969 squote = dquote = 0;
11970 ikind = PyUnicode_KIND(unicode);
11971 for (i = 0; i < isize; i++) {
11972 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11973 switch (ch) {
11974 case '\'': squote++; osize++; break;
11975 case '"': dquote++; osize++; break;
11976 case '\\': case '\t': case '\r': case '\n':
11977 osize += 2; break;
11978 default:
11979 /* Fast-path ASCII */
11980 if (ch < ' ' || ch == 0x7f)
11981 osize += 4; /* \xHH */
11982 else if (ch < 0x7f)
11983 osize++;
11984 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11985 osize++;
11986 max = ch > max ? ch : max;
11987 }
11988 else if (ch < 0x100)
11989 osize += 4; /* \xHH */
11990 else if (ch < 0x10000)
11991 osize += 6; /* \uHHHH */
11992 else
11993 osize += 10; /* \uHHHHHHHH */
11994 }
11995 }
11996
11997 quote = '\'';
11998 if (squote) {
11999 if (dquote)
12000 /* Both squote and dquote present. Use squote,
12001 and escape them */
12002 osize += squote;
12003 else
12004 quote = '"';
12005 }
12006
12007 repr = PyUnicode_New(osize, max);
12008 if (repr == NULL)
12009 return NULL;
12010 okind = PyUnicode_KIND(repr);
12011 odata = PyUnicode_DATA(repr);
12012
12013 PyUnicode_WRITE(okind, odata, 0, quote);
12014 PyUnicode_WRITE(okind, odata, osize-1, quote);
12015
12016 for (i = 0, o = 1; i < isize; i++) {
12017 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012018
12019 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 if ((ch == quote) || (ch == '\\')) {
12021 PyUnicode_WRITE(okind, odata, o++, '\\');
12022 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012023 continue;
12024 }
12025
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012027 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 PyUnicode_WRITE(okind, odata, o++, '\\');
12029 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012030 }
12031 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 PyUnicode_WRITE(okind, odata, o++, '\\');
12033 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012034 }
12035 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 PyUnicode_WRITE(okind, odata, o++, '\\');
12037 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012038 }
12039
12040 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012041 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 PyUnicode_WRITE(okind, odata, o++, '\\');
12043 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012044 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12045 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012046 }
12047
Georg Brandl559e5d72008-06-11 18:37:52 +000012048 /* Copy ASCII characters as-is */
12049 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012051 }
12052
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012054 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012055 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012056 (categories Z* and C* except ASCII space)
12057 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012059 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012060 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12064 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012065 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012066 /* Map 16-bit characters to '\uxxxx' */
12067 else if (ch <= 0xffff) {
12068 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12071 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12072 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012073 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012074 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012075 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012076 PyUnicode_WRITE(okind, odata, o++, 'U');
12077 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12078 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12079 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12080 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012081 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12084 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012085 }
12086 }
12087 /* Copy characters as-is */
12088 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012090 }
12091 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012094 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012095 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096}
12097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012098PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100\n\
12101Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012102such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103arguments start and end are interpreted as in slice notation.\n\
12104\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012105Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106
12107static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012110 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012111 Py_ssize_t start;
12112 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012113 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114
Jesus Ceaac451502011-04-20 17:09:23 +020012115 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12116 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012117 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 if (PyUnicode_READY(self) == -1)
12120 return NULL;
12121 if (PyUnicode_READY(substring) == -1)
12122 return NULL;
12123
Victor Stinner7931d9a2011-11-04 00:22:48 +010012124 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125
12126 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (result == -2)
12129 return NULL;
12130
Christian Heimes217cfd12007-12-02 14:31:20 +000012131 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132}
12133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012134PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012137Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138
12139static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012142 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012143 Py_ssize_t start;
12144 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012145 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
Jesus Ceaac451502011-04-20 17:09:23 +020012147 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12148 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 if (PyUnicode_READY(self) == -1)
12152 return NULL;
12153 if (PyUnicode_READY(substring) == -1)
12154 return NULL;
12155
Victor Stinner7931d9a2011-11-04 00:22:48 +010012156 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157
12158 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 if (result == -2)
12161 return NULL;
12162
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163 if (result < 0) {
12164 PyErr_SetString(PyExc_ValueError, "substring not found");
12165 return NULL;
12166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167
Christian Heimes217cfd12007-12-02 14:31:20 +000012168 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169}
12170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012171PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012174Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012175done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
12177static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012178unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012180 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 Py_UCS4 fillchar = ' ';
12182
Victor Stinnere9a29352011-10-01 02:14:59 +020012183 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012185
Benjamin Petersonbac79492012-01-14 13:34:47 -050012186 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 return NULL;
12188
Victor Stinnerc4b49542011-12-11 22:44:26 +010012189 if (PyUnicode_GET_LENGTH(self) >= width)
12190 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
Victor Stinnerc4b49542011-12-11 22:44:26 +010012192 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193}
12194
Alexander Belopolsky40018472011-02-26 01:02:56 +000012195PyObject *
12196PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197{
12198 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012199
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200 s = PyUnicode_FromObject(s);
12201 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012202 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 if (sep != NULL) {
12204 sep = PyUnicode_FromObject(sep);
12205 if (sep == NULL) {
12206 Py_DECREF(s);
12207 return NULL;
12208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209 }
12210
Victor Stinner9310abb2011-10-05 00:59:23 +020012211 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
12213 Py_DECREF(s);
12214 Py_XDECREF(sep);
12215 return result;
12216}
12217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012218PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012219 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220\n\
12221Return a list of the words in S, using sep as the\n\
12222delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012223splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012224whitespace string is a separator and empty strings are\n\
12225removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226
12227static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012228unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012230 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012232 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012234 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12235 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236 return NULL;
12237
12238 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012241 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012243 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244}
12245
Thomas Wouters477c8d52006-05-27 19:21:47 +000012246PyObject *
12247PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12248{
12249 PyObject* str_obj;
12250 PyObject* sep_obj;
12251 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 int kind1, kind2, kind;
12253 void *buf1 = NULL, *buf2 = NULL;
12254 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255
12256 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012257 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012259 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012260 if (!sep_obj) {
12261 Py_DECREF(str_obj);
12262 return NULL;
12263 }
12264 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12265 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012266 Py_DECREF(str_obj);
12267 return NULL;
12268 }
12269
Victor Stinner14f8f022011-10-05 20:58:25 +020012270 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012272 kind = Py_MAX(kind1, kind2);
12273 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012275 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 if (!buf1)
12277 goto onError;
12278 buf2 = PyUnicode_DATA(sep_obj);
12279 if (kind2 != kind)
12280 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12281 if (!buf2)
12282 goto onError;
12283 len1 = PyUnicode_GET_LENGTH(str_obj);
12284 len2 = PyUnicode_GET_LENGTH(sep_obj);
12285
Benjamin Petersonead6b532011-12-20 17:23:42 -060012286 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012288 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12289 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12290 else
12291 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 break;
12293 case PyUnicode_2BYTE_KIND:
12294 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12295 break;
12296 case PyUnicode_4BYTE_KIND:
12297 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12298 break;
12299 default:
12300 assert(0);
12301 out = 0;
12302 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012303
12304 Py_DECREF(sep_obj);
12305 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 if (kind1 != kind)
12307 PyMem_Free(buf1);
12308 if (kind2 != kind)
12309 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012310
12311 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 onError:
12313 Py_DECREF(sep_obj);
12314 Py_DECREF(str_obj);
12315 if (kind1 != kind && buf1)
12316 PyMem_Free(buf1);
12317 if (kind2 != kind && buf2)
12318 PyMem_Free(buf2);
12319 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012320}
12321
12322
12323PyObject *
12324PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12325{
12326 PyObject* str_obj;
12327 PyObject* sep_obj;
12328 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 int kind1, kind2, kind;
12330 void *buf1 = NULL, *buf2 = NULL;
12331 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012332
12333 str_obj = PyUnicode_FromObject(str_in);
12334 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012336 sep_obj = PyUnicode_FromObject(sep_in);
12337 if (!sep_obj) {
12338 Py_DECREF(str_obj);
12339 return NULL;
12340 }
12341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 kind1 = PyUnicode_KIND(str_in);
12343 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012344 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 buf1 = PyUnicode_DATA(str_in);
12346 if (kind1 != kind)
12347 buf1 = _PyUnicode_AsKind(str_in, kind);
12348 if (!buf1)
12349 goto onError;
12350 buf2 = PyUnicode_DATA(sep_obj);
12351 if (kind2 != kind)
12352 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12353 if (!buf2)
12354 goto onError;
12355 len1 = PyUnicode_GET_LENGTH(str_obj);
12356 len2 = PyUnicode_GET_LENGTH(sep_obj);
12357
Benjamin Petersonead6b532011-12-20 17:23:42 -060012358 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012360 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12361 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12362 else
12363 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 break;
12365 case PyUnicode_2BYTE_KIND:
12366 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12367 break;
12368 case PyUnicode_4BYTE_KIND:
12369 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12370 break;
12371 default:
12372 assert(0);
12373 out = 0;
12374 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012375
12376 Py_DECREF(sep_obj);
12377 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 if (kind1 != kind)
12379 PyMem_Free(buf1);
12380 if (kind2 != kind)
12381 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012382
12383 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 onError:
12385 Py_DECREF(sep_obj);
12386 Py_DECREF(str_obj);
12387 if (kind1 != kind && buf1)
12388 PyMem_Free(buf1);
12389 if (kind2 != kind && buf2)
12390 PyMem_Free(buf2);
12391 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012392}
12393
12394PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012396\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012397Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012398the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012399found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012400
12401static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012402unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012403{
Victor Stinner9310abb2011-10-05 00:59:23 +020012404 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012405}
12406
12407PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012408 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012409\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012410Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012411the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012412separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012413
12414static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012415unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012416{
Victor Stinner9310abb2011-10-05 00:59:23 +020012417 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012418}
12419
Alexander Belopolsky40018472011-02-26 01:02:56 +000012420PyObject *
12421PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012422{
12423 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012424
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012425 s = PyUnicode_FromObject(s);
12426 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012427 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012428 if (sep != NULL) {
12429 sep = PyUnicode_FromObject(sep);
12430 if (sep == NULL) {
12431 Py_DECREF(s);
12432 return NULL;
12433 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012434 }
12435
Victor Stinner9310abb2011-10-05 00:59:23 +020012436 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012437
12438 Py_DECREF(s);
12439 Py_XDECREF(sep);
12440 return result;
12441}
12442
12443PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012444 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012445\n\
12446Return a list of the words in S, using sep as the\n\
12447delimiter string, starting at the end of the string and\n\
12448working to the front. If maxsplit is given, at most maxsplit\n\
12449splits are done. If sep is not specified, any whitespace string\n\
12450is a separator.");
12451
12452static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012453unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012454{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012455 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012456 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012457 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012458
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012459 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12460 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012461 return NULL;
12462
12463 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012464 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012465 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012466 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012467 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012468 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012469}
12470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012471PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012472 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473\n\
12474Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012475Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012476is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477
12478static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012479unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012481 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012482 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012484 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12485 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486 return NULL;
12487
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012488 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489}
12490
12491static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012492PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012494 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495}
12496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012497PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499\n\
12500Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012501and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502
12503static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012504unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012506 if (PyUnicode_READY(self) == -1)
12507 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012508 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509}
12510
Georg Brandlceee0772007-11-27 23:48:05 +000012511PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012513\n\
12514Return a translation table usable for str.translate().\n\
12515If there is only one argument, it must be a dictionary mapping Unicode\n\
12516ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012517Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012518If there are two arguments, they must be strings of equal length, and\n\
12519in the resulting dictionary, each character in x will be mapped to the\n\
12520character at the same position in y. If there is a third argument, it\n\
12521must be a string, whose characters will be mapped to None in the result.");
12522
12523static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012524unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012525{
12526 PyObject *x, *y = NULL, *z = NULL;
12527 PyObject *new = NULL, *key, *value;
12528 Py_ssize_t i = 0;
12529 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012530
Georg Brandlceee0772007-11-27 23:48:05 +000012531 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12532 return NULL;
12533 new = PyDict_New();
12534 if (!new)
12535 return NULL;
12536 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 int x_kind, y_kind, z_kind;
12538 void *x_data, *y_data, *z_data;
12539
Georg Brandlceee0772007-11-27 23:48:05 +000012540 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012541 if (!PyUnicode_Check(x)) {
12542 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12543 "be a string if there is a second argument");
12544 goto err;
12545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012547 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12548 "arguments must have equal length");
12549 goto err;
12550 }
12551 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 x_kind = PyUnicode_KIND(x);
12553 y_kind = PyUnicode_KIND(y);
12554 x_data = PyUnicode_DATA(x);
12555 y_data = PyUnicode_DATA(y);
12556 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12557 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012558 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012559 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012560 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012561 if (!value) {
12562 Py_DECREF(key);
12563 goto err;
12564 }
Georg Brandlceee0772007-11-27 23:48:05 +000012565 res = PyDict_SetItem(new, key, value);
12566 Py_DECREF(key);
12567 Py_DECREF(value);
12568 if (res < 0)
12569 goto err;
12570 }
12571 /* create entries for deleting chars in z */
12572 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 z_kind = PyUnicode_KIND(z);
12574 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012575 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012577 if (!key)
12578 goto err;
12579 res = PyDict_SetItem(new, key, Py_None);
12580 Py_DECREF(key);
12581 if (res < 0)
12582 goto err;
12583 }
12584 }
12585 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 int kind;
12587 void *data;
12588
Georg Brandlceee0772007-11-27 23:48:05 +000012589 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012590 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012591 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12592 "to maketrans it must be a dict");
12593 goto err;
12594 }
12595 /* copy entries into the new dict, converting string keys to int keys */
12596 while (PyDict_Next(x, &i, &key, &value)) {
12597 if (PyUnicode_Check(key)) {
12598 /* convert string keys to integer keys */
12599 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012600 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012601 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12602 "table must be of length 1");
12603 goto err;
12604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 kind = PyUnicode_KIND(key);
12606 data = PyUnicode_DATA(key);
12607 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012608 if (!newkey)
12609 goto err;
12610 res = PyDict_SetItem(new, newkey, value);
12611 Py_DECREF(newkey);
12612 if (res < 0)
12613 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012614 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012615 /* just keep integer keys */
12616 if (PyDict_SetItem(new, key, value) < 0)
12617 goto err;
12618 } else {
12619 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12620 "be strings or integers");
12621 goto err;
12622 }
12623 }
12624 }
12625 return new;
12626 err:
12627 Py_DECREF(new);
12628 return NULL;
12629}
12630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012631PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012632 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633\n\
12634Return a copy of the string S, where all characters have been mapped\n\
12635through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012636Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012637Unmapped characters are left untouched. Characters mapped to None\n\
12638are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639
12640static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644}
12645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012646PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012647 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012649Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
12651static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012652unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012654 if (PyUnicode_READY(self) == -1)
12655 return NULL;
12656 if (PyUnicode_IS_ASCII(self))
12657 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012658 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659}
12660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012661PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012664Pad a numeric string S with zeros on the left, to fill a field\n\
12665of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666
12667static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012668unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012670 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012671 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012672 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 int kind;
12674 void *data;
12675 Py_UCS4 chr;
12676
Martin v. Löwis18e16552006-02-15 17:27:45 +000012677 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678 return NULL;
12679
Benjamin Petersonbac79492012-01-14 13:34:47 -050012680 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682
Victor Stinnerc4b49542011-12-11 22:44:26 +010012683 if (PyUnicode_GET_LENGTH(self) >= width)
12684 return unicode_result_unchanged(self);
12685
12686 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687
12688 u = pad(self, fill, 0, '0');
12689
Walter Dörwald068325e2002-04-15 13:36:47 +000012690 if (u == NULL)
12691 return NULL;
12692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 kind = PyUnicode_KIND(u);
12694 data = PyUnicode_DATA(u);
12695 chr = PyUnicode_READ(kind, data, fill);
12696
12697 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 PyUnicode_WRITE(kind, data, 0, chr);
12700 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701 }
12702
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012703 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012704 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706
#if 0
/* Compiled out: wrapper around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012715PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012718Return True if S starts with the specified prefix, False otherwise.\n\
12719With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012720With optional end, stop comparing S at that position.\n\
12721prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722
12723static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012724unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012727 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012728 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012729 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012730 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012731 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732
Jesus Ceaac451502011-04-20 17:09:23 +020012733 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012734 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012735 if (PyTuple_Check(subobj)) {
12736 Py_ssize_t i;
12737 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012738 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012739 if (substring == NULL)
12740 return NULL;
12741 result = tailmatch(self, substring, start, end, -1);
12742 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012743 if (result == -1)
12744 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012745 if (result) {
12746 Py_RETURN_TRUE;
12747 }
12748 }
12749 /* nothing matched */
12750 Py_RETURN_FALSE;
12751 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012752 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012753 if (substring == NULL) {
12754 if (PyErr_ExceptionMatches(PyExc_TypeError))
12755 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12756 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012758 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012759 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012761 if (result == -1)
12762 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012763 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764}
12765
12766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012767PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012768 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012770Return True if S ends with the specified suffix, False otherwise.\n\
12771With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012772With optional end, stop comparing S at that position.\n\
12773suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774
12775static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012776unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012777 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012779 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012780 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012781 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012782 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012783 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784
Jesus Ceaac451502011-04-20 17:09:23 +020012785 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012786 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012787 if (PyTuple_Check(subobj)) {
12788 Py_ssize_t i;
12789 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012790 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012791 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012792 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012793 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012794 result = tailmatch(self, substring, start, end, +1);
12795 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012796 if (result == -1)
12797 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012798 if (result) {
12799 Py_RETURN_TRUE;
12800 }
12801 }
12802 Py_RETURN_FALSE;
12803 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012804 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012805 if (substring == NULL) {
12806 if (PyErr_ExceptionMatches(PyExc_TypeError))
12807 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12808 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012810 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012811 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010012812 if (result == -1)
12813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012815 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816}
12817
Victor Stinner202fdca2012-05-07 12:47:02 +020012818Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012819_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012820{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012821 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012822 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12823 writer->data = PyUnicode_DATA(writer->buffer);
12824 writer->kind = PyUnicode_KIND(writer->buffer);
12825}
12826
Victor Stinnerd3f08822012-05-29 12:57:52 +020012827void
12828_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012829{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012830 memset(writer, 0, sizeof(*writer));
12831#ifdef Py_DEBUG
12832 writer->kind = 5; /* invalid kind */
12833#endif
12834 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012835 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012836}
12837
Victor Stinnerd3f08822012-05-29 12:57:52 +020012838int
12839_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12840 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012841{
12842 Py_ssize_t newlen;
12843 PyObject *newbuffer;
12844
Victor Stinnerd3f08822012-05-29 12:57:52 +020012845 assert(length > 0);
12846
Victor Stinner202fdca2012-05-07 12:47:02 +020012847 if (length > PY_SSIZE_T_MAX - writer->pos) {
12848 PyErr_NoMemory();
12849 return -1;
12850 }
12851 newlen = writer->pos + length;
12852
Victor Stinnerd3f08822012-05-29 12:57:52 +020012853 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012854 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012855 /* overallocate 25% to limit the number of resize */
12856 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12857 newlen += newlen / 4;
12858 if (newlen < writer->min_length)
12859 newlen = writer->min_length;
12860 }
12861 writer->buffer = PyUnicode_New(newlen, maxchar);
12862 if (writer->buffer == NULL)
12863 return -1;
12864 _PyUnicodeWriter_Update(writer);
12865 return 0;
12866 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012867
Victor Stinnerd3f08822012-05-29 12:57:52 +020012868 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012869 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012870 /* overallocate 25% to limit the number of resize */
12871 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12872 newlen += newlen / 4;
12873 if (newlen < writer->min_length)
12874 newlen = writer->min_length;
12875 }
12876
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012877 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012878 /* resize + widen */
12879 newbuffer = PyUnicode_New(newlen, maxchar);
12880 if (newbuffer == NULL)
12881 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012882 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12883 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012884 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012885 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012886 }
12887 else {
12888 newbuffer = resize_compact(writer->buffer, newlen);
12889 if (newbuffer == NULL)
12890 return -1;
12891 }
12892 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012893 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012894 }
12895 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012896 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012897 newbuffer = PyUnicode_New(writer->size, maxchar);
12898 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012899 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012900 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12901 writer->buffer, 0, writer->pos);
12902 Py_DECREF(writer->buffer);
12903 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012904 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012905 }
12906 return 0;
12907}
12908
Victor Stinnerd3f08822012-05-29 12:57:52 +020012909int
12910_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12911{
12912 Py_UCS4 maxchar;
12913 Py_ssize_t len;
12914
12915 if (PyUnicode_READY(str) == -1)
12916 return -1;
12917 len = PyUnicode_GET_LENGTH(str);
12918 if (len == 0)
12919 return 0;
12920 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12921 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012922 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012923 Py_INCREF(str);
12924 writer->buffer = str;
12925 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012926 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012927 writer->size = 0;
12928 writer->pos += len;
12929 return 0;
12930 }
12931 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12932 return -1;
12933 }
12934 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12935 str, 0, len);
12936 writer->pos += len;
12937 return 0;
12938}
12939
Victor Stinnere215d962012-10-06 23:03:36 +020012940int
Victor Stinnercfc4c132013-04-03 01:48:39 +020012941_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
12942 Py_ssize_t start, Py_ssize_t end)
12943{
12944 Py_UCS4 maxchar;
12945 Py_ssize_t len;
12946
12947 if (PyUnicode_READY(str) == -1)
12948 return -1;
12949
12950 assert(0 <= start);
12951 assert(end <= PyUnicode_GET_LENGTH(str));
12952 assert(start <= end);
12953
12954 if (end == 0)
12955 return 0;
12956
12957 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
12958 return _PyUnicodeWriter_WriteStr(writer, str);
12959
12960 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
12961 maxchar = _PyUnicode_FindMaxChar(str, start, end);
12962 else
12963 maxchar = writer->maxchar;
12964 len = end - start;
12965
12966 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
12967 return -1;
12968
12969 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12970 str, start, len);
12971 writer->pos += len;
12972 return 0;
12973}
12974
12975int
Victor Stinnere215d962012-10-06 23:03:36 +020012976_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12977{
12978 Py_UCS4 maxchar;
12979
12980 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12981 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12982 return -1;
12983 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12984 writer->pos += len;
12985 return 0;
12986}
12987
Victor Stinnerd3f08822012-05-29 12:57:52 +020012988PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012989_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012990{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012991 if (writer->pos == 0) {
12992 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012993 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012994 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012995 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012996 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12997 return writer->buffer;
12998 }
12999 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13000 PyObject *newbuffer;
13001 newbuffer = resize_compact(writer->buffer, writer->pos);
13002 if (newbuffer == NULL) {
13003 Py_DECREF(writer->buffer);
13004 return NULL;
13005 }
13006 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013007 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020013008 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010013009 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013010}
13011
Victor Stinnerd3f08822012-05-29 12:57:52 +020013012void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013013_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013014{
13015 Py_CLEAR(writer->buffer);
13016}
13017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013019
13020PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013021 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013022\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013023Return a formatted version of S, using substitutions from args and kwargs.\n\
13024The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013025
Eric Smith27bbca62010-11-04 17:06:58 +000013026PyDoc_STRVAR(format_map__doc__,
13027 "S.format_map(mapping) -> str\n\
13028\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013029Return a formatted version of S, using substitutions from mapping.\n\
13030The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013031
Eric Smith4a7d76d2008-05-30 18:10:19 +000013032static PyObject *
13033unicode__format__(PyObject* self, PyObject* args)
13034{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013035 PyObject *format_spec;
13036 _PyUnicodeWriter writer;
13037 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013038
13039 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13040 return NULL;
13041
Victor Stinnerd3f08822012-05-29 12:57:52 +020013042 if (PyUnicode_READY(self) == -1)
13043 return NULL;
13044 _PyUnicodeWriter_Init(&writer, 0);
13045 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13046 self, format_spec, 0,
13047 PyUnicode_GET_LENGTH(format_spec));
13048 if (ret == -1) {
13049 _PyUnicodeWriter_Dealloc(&writer);
13050 return NULL;
13051 }
13052 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013053}
13054
Eric Smith8c663262007-08-25 02:26:07 +000013055PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013056 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013057\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013058Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013059
13060static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013061unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 Py_ssize_t size;
13064
13065 /* If it's a compact object, account for base structure +
13066 character data. */
13067 if (PyUnicode_IS_COMPACT_ASCII(v))
13068 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13069 else if (PyUnicode_IS_COMPACT(v))
13070 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013071 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 else {
13073 /* If it is a two-block object, account for base object, and
13074 for character block if present. */
13075 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013076 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013078 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013079 }
13080 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013081 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013082 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013083 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013084 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013085 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013086
13087 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013088}
13089
13090PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013092
13093static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013094unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013095{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013096 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013097 if (!copy)
13098 return NULL;
13099 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013100}
13101
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013103 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013104 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013105 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13106 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013107 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13108 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013109 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013110 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13111 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13112 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13113 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13114 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013115 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013116 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13117 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13118 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013119 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013120 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13121 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13122 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013123 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013124 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013125 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013126 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013127 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13128 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13129 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13130 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13131 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13132 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13133 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13134 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13135 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13136 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13137 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13138 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13139 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13140 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013141 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013142 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013143 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013144 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013145 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013146 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013147 {"maketrans", (PyCFunction) unicode_maketrans,
13148 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013149 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013150#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013151 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013152 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153#endif
13154
Benjamin Peterson14339b62009-01-31 16:36:08 +000013155 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156 {NULL, NULL}
13157};
13158
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013159static PyObject *
13160unicode_mod(PyObject *v, PyObject *w)
13161{
Brian Curtindfc80e32011-08-10 20:28:54 -050013162 if (!PyUnicode_Check(v))
13163 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013165}
13166
13167static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013168 0, /*nb_add*/
13169 0, /*nb_subtract*/
13170 0, /*nb_multiply*/
13171 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013172};
13173
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013175 (lenfunc) unicode_length, /* sq_length */
13176 PyUnicode_Concat, /* sq_concat */
13177 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13178 (ssizeargfunc) unicode_getitem, /* sq_item */
13179 0, /* sq_slice */
13180 0, /* sq_ass_item */
13181 0, /* sq_ass_slice */
13182 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183};
13184
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013185static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013186unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013188 if (PyUnicode_READY(self) == -1)
13189 return NULL;
13190
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013191 if (PyIndex_Check(item)) {
13192 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013193 if (i == -1 && PyErr_Occurred())
13194 return NULL;
13195 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013197 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013198 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013199 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013200 PyObject *result;
13201 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013202 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013203 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013207 return NULL;
13208 }
13209
13210 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013211 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013213 slicelength == PyUnicode_GET_LENGTH(self)) {
13214 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013215 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013216 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013217 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013218 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013219 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013220 src_kind = PyUnicode_KIND(self);
13221 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013222 if (!PyUnicode_IS_ASCII(self)) {
13223 kind_limit = kind_maxchar_limit(src_kind);
13224 max_char = 0;
13225 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13226 ch = PyUnicode_READ(src_kind, src_data, cur);
13227 if (ch > max_char) {
13228 max_char = ch;
13229 if (max_char >= kind_limit)
13230 break;
13231 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013232 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013233 }
Victor Stinner55c99112011-10-13 01:17:06 +020013234 else
13235 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013236 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013237 if (result == NULL)
13238 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013239 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013240 dest_data = PyUnicode_DATA(result);
13241
13242 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013243 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13244 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013245 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013246 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013247 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013248 } else {
13249 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13250 return NULL;
13251 }
13252}
13253
13254static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013255 (lenfunc)unicode_length, /* mp_length */
13256 (binaryfunc)unicode_subscript, /* mp_subscript */
13257 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013258};
13259
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261/* Helpers for PyUnicode_Format() */
13262
Victor Stinnera47082312012-10-04 02:19:54 +020013263struct unicode_formatter_t {
13264 PyObject *args;
13265 int args_owned;
13266 Py_ssize_t arglen, argidx;
13267 PyObject *dict;
13268
13269 enum PyUnicode_Kind fmtkind;
13270 Py_ssize_t fmtcnt, fmtpos;
13271 void *fmtdata;
13272 PyObject *fmtstr;
13273
13274 _PyUnicodeWriter writer;
13275};
13276
13277struct unicode_format_arg_t {
13278 Py_UCS4 ch;
13279 int flags;
13280 Py_ssize_t width;
13281 int prec;
13282 int sign;
13283};
13284
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013286unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287{
Victor Stinnera47082312012-10-04 02:19:54 +020013288 Py_ssize_t argidx = ctx->argidx;
13289
13290 if (argidx < ctx->arglen) {
13291 ctx->argidx++;
13292 if (ctx->arglen < 0)
13293 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013294 else
Victor Stinnera47082312012-10-04 02:19:54 +020013295 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296 }
13297 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013298 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299 return NULL;
13300}
13301
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013302/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303
Victor Stinnera47082312012-10-04 02:19:54 +020013304/* Format a float into the writer if the writer is not NULL, or into *p_output
13305 otherwise.
13306
13307 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013308static int
Victor Stinnera47082312012-10-04 02:19:54 +020013309formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13310 PyObject **p_output,
13311 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013313 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013314 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013315 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013316 int prec;
13317 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013318
Guido van Rossumd57fd912000-03-10 22:53:23 +000013319 x = PyFloat_AsDouble(v);
13320 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013321 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013322
Victor Stinnera47082312012-10-04 02:19:54 +020013323 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013325 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013326
Victor Stinnera47082312012-10-04 02:19:54 +020013327 if (arg->flags & F_ALT)
13328 dtoa_flags = Py_DTSF_ALT;
13329 else
13330 dtoa_flags = 0;
13331 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013332 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013333 return -1;
13334 len = strlen(p);
13335 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013336 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13337 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013338 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013339 }
Victor Stinner184252a2012-06-16 02:57:41 +020013340 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013341 writer->pos += len;
13342 }
13343 else
13344 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013345 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013346 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347}
13348
Victor Stinnerd0880d52012-04-27 23:40:13 +020013349/* formatlong() emulates the format codes d, u, o, x and X, and
13350 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13351 * Python's regular ints.
13352 * Return value: a new PyUnicodeObject*, or NULL if error.
13353 * The output string is of the form
13354 * "-"? ("0x" | "0X")? digit+
13355 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13356 * set in flags. The case of hex digits will be correct,
13357 * There will be at least prec digits, zero-filled on the left if
13358 * necessary to get that many.
13359 * val object to be converted
13360 * flags bitmask of format flags; only F_ALT is looked at
13361 * prec minimum number of digits; 0-fill on left if needed
13362 * type a character in [duoxX]; u acts the same as d
13363 *
13364 * CAUTION: o, x and X conversions on regular ints can never
13365 * produce a '-' sign, but can for Python's unbounded ints.
13366 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013367static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013368formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013369{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013370 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013371 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013372 Py_ssize_t i;
13373 int sign; /* 1 if '-', else 0 */
13374 int len; /* number of characters */
13375 Py_ssize_t llen;
13376 int numdigits; /* len == numnondigits + numdigits */
13377 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013378 int prec = arg->prec;
13379 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013380
Victor Stinnerd0880d52012-04-27 23:40:13 +020013381 /* Avoid exceeding SSIZE_T_MAX */
13382 if (prec > INT_MAX-3) {
13383 PyErr_SetString(PyExc_OverflowError,
13384 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013385 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013386 }
13387
13388 assert(PyLong_Check(val));
13389
13390 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013391 default:
13392 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013393 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013394 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013395 case 'u':
13396 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013397 if (PyBool_Check(val))
13398 result = PyNumber_ToBase(val, 10);
13399 else
13400 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013401 break;
13402 case 'o':
13403 numnondigits = 2;
13404 result = PyNumber_ToBase(val, 8);
13405 break;
13406 case 'x':
13407 case 'X':
13408 numnondigits = 2;
13409 result = PyNumber_ToBase(val, 16);
13410 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013411 }
13412 if (!result)
13413 return NULL;
13414
13415 assert(unicode_modifiable(result));
13416 assert(PyUnicode_IS_READY(result));
13417 assert(PyUnicode_IS_ASCII(result));
13418
13419 /* To modify the string in-place, there can only be one reference. */
13420 if (Py_REFCNT(result) != 1) {
13421 PyErr_BadInternalCall();
13422 return NULL;
13423 }
13424 buf = PyUnicode_DATA(result);
13425 llen = PyUnicode_GET_LENGTH(result);
13426 if (llen > INT_MAX) {
13427 PyErr_SetString(PyExc_ValueError,
13428 "string too large in _PyBytes_FormatLong");
13429 return NULL;
13430 }
13431 len = (int)llen;
13432 sign = buf[0] == '-';
13433 numnondigits += sign;
13434 numdigits = len - numnondigits;
13435 assert(numdigits > 0);
13436
13437 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013438 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013439 (type == 'o' || type == 'x' || type == 'X'))) {
13440 assert(buf[sign] == '0');
13441 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13442 buf[sign+1] == 'o');
13443 numnondigits -= 2;
13444 buf += 2;
13445 len -= 2;
13446 if (sign)
13447 buf[0] = '-';
13448 assert(len == numnondigits + numdigits);
13449 assert(numdigits > 0);
13450 }
13451
13452 /* Fill with leading zeroes to meet minimum width. */
13453 if (prec > numdigits) {
13454 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13455 numnondigits + prec);
13456 char *b1;
13457 if (!r1) {
13458 Py_DECREF(result);
13459 return NULL;
13460 }
13461 b1 = PyBytes_AS_STRING(r1);
13462 for (i = 0; i < numnondigits; ++i)
13463 *b1++ = *buf++;
13464 for (i = 0; i < prec - numdigits; i++)
13465 *b1++ = '0';
13466 for (i = 0; i < numdigits; i++)
13467 *b1++ = *buf++;
13468 *b1 = '\0';
13469 Py_DECREF(result);
13470 result = r1;
13471 buf = PyBytes_AS_STRING(result);
13472 len = numnondigits + prec;
13473 }
13474
13475 /* Fix up case for hex conversions. */
13476 if (type == 'X') {
13477 /* Need to convert all lower case letters to upper case.
13478 and need to convert 0x to 0X (and -0x to -0X). */
13479 for (i = 0; i < len; i++)
13480 if (buf[i] >= 'a' && buf[i] <= 'x')
13481 buf[i] -= 'a'-'A';
13482 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013483 if (!PyUnicode_Check(result)
13484 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013485 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013486 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013487 Py_DECREF(result);
13488 result = unicode;
13489 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013490 else if (len != PyUnicode_GET_LENGTH(result)) {
13491 if (PyUnicode_Resize(&result, len) < 0)
13492 Py_CLEAR(result);
13493 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013494 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013495}
13496
Victor Stinner621ef3d2012-10-02 00:33:47 +020013497/* Format an integer.
13498 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013499 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013500 * -1 and raise an exception on error */
13501static int
Victor Stinnera47082312012-10-04 02:19:54 +020013502mainformatlong(PyObject *v,
13503 struct unicode_format_arg_t *arg,
13504 PyObject **p_output,
13505 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013506{
13507 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013508 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013509
13510 if (!PyNumber_Check(v))
13511 goto wrongtype;
13512
13513 if (!PyLong_Check(v)) {
13514 iobj = PyNumber_Long(v);
13515 if (iobj == NULL) {
13516 if (PyErr_ExceptionMatches(PyExc_TypeError))
13517 goto wrongtype;
13518 return -1;
13519 }
13520 assert(PyLong_Check(iobj));
13521 }
13522 else {
13523 iobj = v;
13524 Py_INCREF(iobj);
13525 }
13526
13527 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013528 && arg->width == -1 && arg->prec == -1
13529 && !(arg->flags & (F_SIGN | F_BLANK))
13530 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013531 {
13532 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013533 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013534 int base;
13535
Victor Stinnera47082312012-10-04 02:19:54 +020013536 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013537 {
13538 default:
13539 assert(0 && "'type' not in [diuoxX]");
13540 case 'd':
13541 case 'i':
13542 case 'u':
13543 base = 10;
13544 break;
13545 case 'o':
13546 base = 8;
13547 break;
13548 case 'x':
13549 case 'X':
13550 base = 16;
13551 break;
13552 }
13553
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013554 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13555 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013556 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013557 }
13558 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013559 return 1;
13560 }
13561
Victor Stinnera47082312012-10-04 02:19:54 +020013562 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013563 Py_DECREF(iobj);
13564 if (res == NULL)
13565 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013566 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013567 return 0;
13568
13569wrongtype:
13570 PyErr_Format(PyExc_TypeError,
13571 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013572 "not %.200s",
13573 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013574 return -1;
13575}
13576
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013577static Py_UCS4
13578formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013579{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013580 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013581 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013582 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013583 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 goto onError;
13586 }
13587 else {
13588 /* Integer input truncated to a character */
13589 long x;
13590 x = PyLong_AsLong(v);
13591 if (x == -1 && PyErr_Occurred())
13592 goto onError;
13593
Victor Stinner8faf8212011-12-08 22:14:11 +010013594 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013595 PyErr_SetString(PyExc_OverflowError,
13596 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013597 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 }
13599
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013600 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013601 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013602
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013604 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013606 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013607}
13608
Victor Stinnera47082312012-10-04 02:19:54 +020013609/* Parse options of an argument: flags, width, precision.
13610 Handle also "%(name)" syntax.
13611
13612 Return 0 if the argument has been formatted into arg->str.
13613 Return 1 if the argument has been written into ctx->writer,
13614 Raise an exception and return -1 on error. */
13615static int
13616unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13617 struct unicode_format_arg_t *arg)
13618{
13619#define FORMAT_READ(ctx) \
13620 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13621
13622 PyObject *v;
13623
Victor Stinnera47082312012-10-04 02:19:54 +020013624 if (arg->ch == '(') {
13625 /* Get argument value from a dictionary. Example: "%(name)s". */
13626 Py_ssize_t keystart;
13627 Py_ssize_t keylen;
13628 PyObject *key;
13629 int pcount = 1;
13630
13631 if (ctx->dict == NULL) {
13632 PyErr_SetString(PyExc_TypeError,
13633 "format requires a mapping");
13634 return -1;
13635 }
13636 ++ctx->fmtpos;
13637 --ctx->fmtcnt;
13638 keystart = ctx->fmtpos;
13639 /* Skip over balanced parentheses */
13640 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13641 arg->ch = FORMAT_READ(ctx);
13642 if (arg->ch == ')')
13643 --pcount;
13644 else if (arg->ch == '(')
13645 ++pcount;
13646 ctx->fmtpos++;
13647 }
13648 keylen = ctx->fmtpos - keystart - 1;
13649 if (ctx->fmtcnt < 0 || pcount > 0) {
13650 PyErr_SetString(PyExc_ValueError,
13651 "incomplete format key");
13652 return -1;
13653 }
13654 key = PyUnicode_Substring(ctx->fmtstr,
13655 keystart, keystart + keylen);
13656 if (key == NULL)
13657 return -1;
13658 if (ctx->args_owned) {
13659 Py_DECREF(ctx->args);
13660 ctx->args_owned = 0;
13661 }
13662 ctx->args = PyObject_GetItem(ctx->dict, key);
13663 Py_DECREF(key);
13664 if (ctx->args == NULL)
13665 return -1;
13666 ctx->args_owned = 1;
13667 ctx->arglen = -1;
13668 ctx->argidx = -2;
13669 }
13670
13671 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013672 while (--ctx->fmtcnt >= 0) {
13673 arg->ch = FORMAT_READ(ctx);
13674 ctx->fmtpos++;
13675 switch (arg->ch) {
13676 case '-': arg->flags |= F_LJUST; continue;
13677 case '+': arg->flags |= F_SIGN; continue;
13678 case ' ': arg->flags |= F_BLANK; continue;
13679 case '#': arg->flags |= F_ALT; continue;
13680 case '0': arg->flags |= F_ZERO; continue;
13681 }
13682 break;
13683 }
13684
13685 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013686 if (arg->ch == '*') {
13687 v = unicode_format_getnextarg(ctx);
13688 if (v == NULL)
13689 return -1;
13690 if (!PyLong_Check(v)) {
13691 PyErr_SetString(PyExc_TypeError,
13692 "* wants int");
13693 return -1;
13694 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013695 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013696 if (arg->width == -1 && PyErr_Occurred())
13697 return -1;
13698 if (arg->width < 0) {
13699 arg->flags |= F_LJUST;
13700 arg->width = -arg->width;
13701 }
13702 if (--ctx->fmtcnt >= 0) {
13703 arg->ch = FORMAT_READ(ctx);
13704 ctx->fmtpos++;
13705 }
13706 }
13707 else if (arg->ch >= '0' && arg->ch <= '9') {
13708 arg->width = arg->ch - '0';
13709 while (--ctx->fmtcnt >= 0) {
13710 arg->ch = FORMAT_READ(ctx);
13711 ctx->fmtpos++;
13712 if (arg->ch < '0' || arg->ch > '9')
13713 break;
13714 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13715 mixing signed and unsigned comparison. Since arg->ch is between
13716 '0' and '9', casting to int is safe. */
13717 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13718 PyErr_SetString(PyExc_ValueError,
13719 "width too big");
13720 return -1;
13721 }
13722 arg->width = arg->width*10 + (arg->ch - '0');
13723 }
13724 }
13725
13726 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013727 if (arg->ch == '.') {
13728 arg->prec = 0;
13729 if (--ctx->fmtcnt >= 0) {
13730 arg->ch = FORMAT_READ(ctx);
13731 ctx->fmtpos++;
13732 }
13733 if (arg->ch == '*') {
13734 v = unicode_format_getnextarg(ctx);
13735 if (v == NULL)
13736 return -1;
13737 if (!PyLong_Check(v)) {
13738 PyErr_SetString(PyExc_TypeError,
13739 "* wants int");
13740 return -1;
13741 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013742 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013743 if (arg->prec == -1 && PyErr_Occurred())
13744 return -1;
13745 if (arg->prec < 0)
13746 arg->prec = 0;
13747 if (--ctx->fmtcnt >= 0) {
13748 arg->ch = FORMAT_READ(ctx);
13749 ctx->fmtpos++;
13750 }
13751 }
13752 else if (arg->ch >= '0' && arg->ch <= '9') {
13753 arg->prec = arg->ch - '0';
13754 while (--ctx->fmtcnt >= 0) {
13755 arg->ch = FORMAT_READ(ctx);
13756 ctx->fmtpos++;
13757 if (arg->ch < '0' || arg->ch > '9')
13758 break;
13759 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13760 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013761 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013762 return -1;
13763 }
13764 arg->prec = arg->prec*10 + (arg->ch - '0');
13765 }
13766 }
13767 }
13768
13769 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13770 if (ctx->fmtcnt >= 0) {
13771 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13772 if (--ctx->fmtcnt >= 0) {
13773 arg->ch = FORMAT_READ(ctx);
13774 ctx->fmtpos++;
13775 }
13776 }
13777 }
13778 if (ctx->fmtcnt < 0) {
13779 PyErr_SetString(PyExc_ValueError,
13780 "incomplete format");
13781 return -1;
13782 }
13783 return 0;
13784
13785#undef FORMAT_READ
13786}
13787
13788/* Format one argument. Supported conversion specifiers:
13789
13790 - "s", "r", "a": any type
13791 - "i", "d", "u", "o", "x", "X": int
13792 - "e", "E", "f", "F", "g", "G": float
13793 - "c": int or str (1 character)
13794
Victor Stinner8dbd4212012-12-04 09:30:24 +010013795 When possible, the output is written directly into the Unicode writer
13796 (ctx->writer). A string is created when padding is required.
13797
Victor Stinnera47082312012-10-04 02:19:54 +020013798 Return 0 if the argument has been formatted into *p_str,
13799 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013800 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013801static int
13802unicode_format_arg_format(struct unicode_formatter_t *ctx,
13803 struct unicode_format_arg_t *arg,
13804 PyObject **p_str)
13805{
13806 PyObject *v;
13807 _PyUnicodeWriter *writer = &ctx->writer;
13808
13809 if (ctx->fmtcnt == 0)
13810 ctx->writer.overallocate = 0;
13811
13812 if (arg->ch == '%') {
13813 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13814 return -1;
13815 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13816 writer->pos += 1;
13817 return 1;
13818 }
13819
13820 v = unicode_format_getnextarg(ctx);
13821 if (v == NULL)
13822 return -1;
13823
Victor Stinnera47082312012-10-04 02:19:54 +020013824
13825 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013826 case 's':
13827 case 'r':
13828 case 'a':
13829 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13830 /* Fast path */
13831 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13832 return -1;
13833 return 1;
13834 }
13835
13836 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13837 *p_str = v;
13838 Py_INCREF(*p_str);
13839 }
13840 else {
13841 if (arg->ch == 's')
13842 *p_str = PyObject_Str(v);
13843 else if (arg->ch == 'r')
13844 *p_str = PyObject_Repr(v);
13845 else
13846 *p_str = PyObject_ASCII(v);
13847 }
13848 break;
13849
13850 case 'i':
13851 case 'd':
13852 case 'u':
13853 case 'o':
13854 case 'x':
13855 case 'X':
13856 {
13857 int ret = mainformatlong(v, arg, p_str, writer);
13858 if (ret != 0)
13859 return ret;
13860 arg->sign = 1;
13861 break;
13862 }
13863
13864 case 'e':
13865 case 'E':
13866 case 'f':
13867 case 'F':
13868 case 'g':
13869 case 'G':
13870 if (arg->width == -1 && arg->prec == -1
13871 && !(arg->flags & (F_SIGN | F_BLANK)))
13872 {
13873 /* Fast path */
13874 if (formatfloat(v, arg, NULL, writer) == -1)
13875 return -1;
13876 return 1;
13877 }
13878
13879 arg->sign = 1;
13880 if (formatfloat(v, arg, p_str, NULL) == -1)
13881 return -1;
13882 break;
13883
13884 case 'c':
13885 {
13886 Py_UCS4 ch = formatchar(v);
13887 if (ch == (Py_UCS4) -1)
13888 return -1;
13889 if (arg->width == -1 && arg->prec == -1) {
13890 /* Fast path */
13891 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13892 return -1;
13893 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13894 writer->pos += 1;
13895 return 1;
13896 }
13897 *p_str = PyUnicode_FromOrdinal(ch);
13898 break;
13899 }
13900
13901 default:
13902 PyErr_Format(PyExc_ValueError,
13903 "unsupported format character '%c' (0x%x) "
13904 "at index %zd",
13905 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13906 (int)arg->ch,
13907 ctx->fmtpos - 1);
13908 return -1;
13909 }
13910 if (*p_str == NULL)
13911 return -1;
13912 assert (PyUnicode_Check(*p_str));
13913 return 0;
13914}
13915
13916static int
13917unicode_format_arg_output(struct unicode_formatter_t *ctx,
13918 struct unicode_format_arg_t *arg,
13919 PyObject *str)
13920{
13921 Py_ssize_t len;
13922 enum PyUnicode_Kind kind;
13923 void *pbuf;
13924 Py_ssize_t pindex;
13925 Py_UCS4 signchar;
13926 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013927 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020013928 Py_ssize_t sublen;
13929 _PyUnicodeWriter *writer = &ctx->writer;
13930 Py_UCS4 fill;
13931
13932 fill = ' ';
13933 if (arg->sign && arg->flags & F_ZERO)
13934 fill = '0';
13935
13936 if (PyUnicode_READY(str) == -1)
13937 return -1;
13938
13939 len = PyUnicode_GET_LENGTH(str);
13940 if ((arg->width == -1 || arg->width <= len)
13941 && (arg->prec == -1 || arg->prec >= len)
13942 && !(arg->flags & (F_SIGN | F_BLANK)))
13943 {
13944 /* Fast path */
13945 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13946 return -1;
13947 return 0;
13948 }
13949
13950 /* Truncate the string for "s", "r" and "a" formats
13951 if the precision is set */
13952 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13953 if (arg->prec >= 0 && len > arg->prec)
13954 len = arg->prec;
13955 }
13956
13957 /* Adjust sign and width */
13958 kind = PyUnicode_KIND(str);
13959 pbuf = PyUnicode_DATA(str);
13960 pindex = 0;
13961 signchar = '\0';
13962 if (arg->sign) {
13963 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13964 if (ch == '-' || ch == '+') {
13965 signchar = ch;
13966 len--;
13967 pindex++;
13968 }
13969 else if (arg->flags & F_SIGN)
13970 signchar = '+';
13971 else if (arg->flags & F_BLANK)
13972 signchar = ' ';
13973 else
13974 arg->sign = 0;
13975 }
13976 if (arg->width < len)
13977 arg->width = len;
13978
13979 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013980 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020013981 if (!(arg->flags & F_LJUST)) {
13982 if (arg->sign) {
13983 if ((arg->width-1) > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013984 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020013985 }
13986 else {
13987 if (arg->width > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013988 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020013989 }
13990 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013991 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
13992 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13993 maxchar = MAX_MAXCHAR(maxchar, strmaxchar);
13994 }
13995
Victor Stinnera47082312012-10-04 02:19:54 +020013996 buflen = arg->width;
13997 if (arg->sign && len == arg->width)
13998 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013999 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014000 return -1;
14001
14002 /* Write the sign if needed */
14003 if (arg->sign) {
14004 if (fill != ' ') {
14005 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14006 writer->pos += 1;
14007 }
14008 if (arg->width > len)
14009 arg->width--;
14010 }
14011
14012 /* Write the numeric prefix for "x", "X" and "o" formats
14013 if the alternate form is used.
14014 For example, write "0x" for the "%#x" format. */
14015 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14016 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14017 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14018 if (fill != ' ') {
14019 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14020 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14021 writer->pos += 2;
14022 pindex += 2;
14023 }
14024 arg->width -= 2;
14025 if (arg->width < 0)
14026 arg->width = 0;
14027 len -= 2;
14028 }
14029
14030 /* Pad left with the fill character if needed */
14031 if (arg->width > len && !(arg->flags & F_LJUST)) {
14032 sublen = arg->width - len;
14033 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14034 writer->pos += sublen;
14035 arg->width = len;
14036 }
14037
14038 /* If padding with spaces: write sign if needed and/or numeric prefix if
14039 the alternate form is used */
14040 if (fill == ' ') {
14041 if (arg->sign) {
14042 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14043 writer->pos += 1;
14044 }
14045 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14046 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14047 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14048 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14049 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14050 writer->pos += 2;
14051 pindex += 2;
14052 }
14053 }
14054
14055 /* Write characters */
14056 if (len) {
14057 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14058 str, pindex, len);
14059 writer->pos += len;
14060 }
14061
14062 /* Pad right with the fill character if needed */
14063 if (arg->width > len) {
14064 sublen = arg->width - len;
14065 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14066 writer->pos += sublen;
14067 }
14068 return 0;
14069}
14070
14071/* Helper of PyUnicode_Format(): format one arg.
14072 Return 0 on success, raise an exception and return -1 on error. */
14073static int
14074unicode_format_arg(struct unicode_formatter_t *ctx)
14075{
14076 struct unicode_format_arg_t arg;
14077 PyObject *str;
14078 int ret;
14079
Victor Stinner8dbd4212012-12-04 09:30:24 +010014080 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14081 arg.flags = 0;
14082 arg.width = -1;
14083 arg.prec = -1;
14084 arg.sign = 0;
14085 str = NULL;
14086
Victor Stinnera47082312012-10-04 02:19:54 +020014087 ret = unicode_format_arg_parse(ctx, &arg);
14088 if (ret == -1)
14089 return -1;
14090
14091 ret = unicode_format_arg_format(ctx, &arg, &str);
14092 if (ret == -1)
14093 return -1;
14094
14095 if (ret != 1) {
14096 ret = unicode_format_arg_output(ctx, &arg, str);
14097 Py_DECREF(str);
14098 if (ret == -1)
14099 return -1;
14100 }
14101
14102 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14103 PyErr_SetString(PyExc_TypeError,
14104 "not all arguments converted during string formatting");
14105 return -1;
14106 }
14107 return 0;
14108}
14109
Alexander Belopolsky40018472011-02-26 01:02:56 +000014110PyObject *
14111PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014112{
Victor Stinnera47082312012-10-04 02:19:54 +020014113 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014114
Guido van Rossumd57fd912000-03-10 22:53:23 +000014115 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014116 PyErr_BadInternalCall();
14117 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014118 }
Victor Stinnera47082312012-10-04 02:19:54 +020014119
14120 ctx.fmtstr = PyUnicode_FromObject(format);
14121 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014122 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014123 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14124 Py_DECREF(ctx.fmtstr);
14125 return NULL;
14126 }
14127 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14128 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14129 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14130 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014131
Victor Stinnera47082312012-10-04 02:19:54 +020014132 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014133
Guido van Rossumd57fd912000-03-10 22:53:23 +000014134 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014135 ctx.arglen = PyTuple_Size(args);
14136 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014137 }
14138 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014139 ctx.arglen = -1;
14140 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014141 }
Victor Stinnera47082312012-10-04 02:19:54 +020014142 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014143 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014144 ctx.dict = args;
14145 else
14146 ctx.dict = NULL;
14147 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014148
Victor Stinnera47082312012-10-04 02:19:54 +020014149 while (--ctx.fmtcnt >= 0) {
14150 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014151 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014152
14153 nonfmtpos = ctx.fmtpos++;
14154 while (ctx.fmtcnt >= 0 &&
14155 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14156 ctx.fmtpos++;
14157 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014158 }
Victor Stinnera47082312012-10-04 02:19:54 +020014159 if (ctx.fmtcnt < 0) {
14160 ctx.fmtpos--;
14161 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014162 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014163
Victor Stinnercfc4c132013-04-03 01:48:39 +020014164 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14165 nonfmtpos, ctx.fmtpos) < 0)
14166 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014167 }
14168 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014169 ctx.fmtpos++;
14170 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014171 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014172 }
14173 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014174
Victor Stinnera47082312012-10-04 02:19:54 +020014175 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014176 PyErr_SetString(PyExc_TypeError,
14177 "not all arguments converted during string formatting");
14178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014179 }
14180
Victor Stinnera47082312012-10-04 02:19:54 +020014181 if (ctx.args_owned) {
14182 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014183 }
Victor Stinnera47082312012-10-04 02:19:54 +020014184 Py_DECREF(ctx.fmtstr);
14185 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014186
Benjamin Peterson29060642009-01-31 22:14:21 +000014187 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014188 Py_DECREF(ctx.fmtstr);
14189 _PyUnicodeWriter_Dealloc(&ctx.writer);
14190 if (ctx.args_owned) {
14191 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014192 }
14193 return NULL;
14194}
14195
Jeremy Hylton938ace62002-07-17 16:30:39 +000014196static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014197unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14198
Tim Peters6d6c1a32001-08-02 04:15:00 +000014199static PyObject *
14200unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14201{
Benjamin Peterson29060642009-01-31 22:14:21 +000014202 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014203 static char *kwlist[] = {"object", "encoding", "errors", 0};
14204 char *encoding = NULL;
14205 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014206
Benjamin Peterson14339b62009-01-31 16:36:08 +000014207 if (type != &PyUnicode_Type)
14208 return unicode_subtype_new(type, args, kwds);
14209 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014210 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014211 return NULL;
14212 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014213 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014214 if (encoding == NULL && errors == NULL)
14215 return PyObject_Str(x);
14216 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014217 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014218}
14219
Guido van Rossume023fe02001-08-30 03:12:59 +000014220static PyObject *
14221unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14222{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014223 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014224 Py_ssize_t length, char_size;
14225 int share_wstr, share_utf8;
14226 unsigned int kind;
14227 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014228
Benjamin Peterson14339b62009-01-31 16:36:08 +000014229 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014230
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014231 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014232 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014233 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014234 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014235 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014236 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014237 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014238 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014239
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014240 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014241 if (self == NULL) {
14242 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014243 return NULL;
14244 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014245 kind = PyUnicode_KIND(unicode);
14246 length = PyUnicode_GET_LENGTH(unicode);
14247
14248 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014249#ifdef Py_DEBUG
14250 _PyUnicode_HASH(self) = -1;
14251#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014252 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014253#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014254 _PyUnicode_STATE(self).interned = 0;
14255 _PyUnicode_STATE(self).kind = kind;
14256 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014257 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014258 _PyUnicode_STATE(self).ready = 1;
14259 _PyUnicode_WSTR(self) = NULL;
14260 _PyUnicode_UTF8_LENGTH(self) = 0;
14261 _PyUnicode_UTF8(self) = NULL;
14262 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014263 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014264
14265 share_utf8 = 0;
14266 share_wstr = 0;
14267 if (kind == PyUnicode_1BYTE_KIND) {
14268 char_size = 1;
14269 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14270 share_utf8 = 1;
14271 }
14272 else if (kind == PyUnicode_2BYTE_KIND) {
14273 char_size = 2;
14274 if (sizeof(wchar_t) == 2)
14275 share_wstr = 1;
14276 }
14277 else {
14278 assert(kind == PyUnicode_4BYTE_KIND);
14279 char_size = 4;
14280 if (sizeof(wchar_t) == 4)
14281 share_wstr = 1;
14282 }
14283
14284 /* Ensure we won't overflow the length. */
14285 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14286 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014287 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014288 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014289 data = PyObject_MALLOC((length + 1) * char_size);
14290 if (data == NULL) {
14291 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014292 goto onError;
14293 }
14294
Victor Stinnerc3c74152011-10-02 20:39:55 +020014295 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014296 if (share_utf8) {
14297 _PyUnicode_UTF8_LENGTH(self) = length;
14298 _PyUnicode_UTF8(self) = data;
14299 }
14300 if (share_wstr) {
14301 _PyUnicode_WSTR_LENGTH(self) = length;
14302 _PyUnicode_WSTR(self) = (wchar_t *)data;
14303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014304
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014305 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014306 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014307 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014308#ifdef Py_DEBUG
14309 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14310#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014311 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014312 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014313
14314onError:
14315 Py_DECREF(unicode);
14316 Py_DECREF(self);
14317 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014318}
14319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014320PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014321"str(object='') -> str\n\
14322str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014323\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014324Create a new string object from the given object. If encoding or\n\
14325errors is specified, then the object must expose a data buffer\n\
14326that will be decoded using the given encoding and error handler.\n\
14327Otherwise, returns the result of object.__str__() (if defined)\n\
14328or repr(object).\n\
14329encoding defaults to sys.getdefaultencoding().\n\
14330errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014331
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014332static PyObject *unicode_iter(PyObject *seq);
14333
Guido van Rossumd57fd912000-03-10 22:53:23 +000014334PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014335 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014336 "str", /* tp_name */
14337 sizeof(PyUnicodeObject), /* tp_size */
14338 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014339 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014340 (destructor)unicode_dealloc, /* tp_dealloc */
14341 0, /* tp_print */
14342 0, /* tp_getattr */
14343 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014344 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 unicode_repr, /* tp_repr */
14346 &unicode_as_number, /* tp_as_number */
14347 &unicode_as_sequence, /* tp_as_sequence */
14348 &unicode_as_mapping, /* tp_as_mapping */
14349 (hashfunc) unicode_hash, /* tp_hash*/
14350 0, /* tp_call*/
14351 (reprfunc) unicode_str, /* tp_str */
14352 PyObject_GenericGetAttr, /* tp_getattro */
14353 0, /* tp_setattro */
14354 0, /* tp_as_buffer */
14355 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014356 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 unicode_doc, /* tp_doc */
14358 0, /* tp_traverse */
14359 0, /* tp_clear */
14360 PyUnicode_RichCompare, /* tp_richcompare */
14361 0, /* tp_weaklistoffset */
14362 unicode_iter, /* tp_iter */
14363 0, /* tp_iternext */
14364 unicode_methods, /* tp_methods */
14365 0, /* tp_members */
14366 0, /* tp_getset */
14367 &PyBaseObject_Type, /* tp_base */
14368 0, /* tp_dict */
14369 0, /* tp_descr_get */
14370 0, /* tp_descr_set */
14371 0, /* tp_dictoffset */
14372 0, /* tp_init */
14373 0, /* tp_alloc */
14374 unicode_new, /* tp_new */
14375 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014376};
14377
14378/* Initialize the Unicode implementation */
14379
Victor Stinner3a50e702011-10-18 21:21:00 +020014380int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014381{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014382 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014383 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014384 0x000A, /* LINE FEED */
14385 0x000D, /* CARRIAGE RETURN */
14386 0x001C, /* FILE SEPARATOR */
14387 0x001D, /* GROUP SEPARATOR */
14388 0x001E, /* RECORD SEPARATOR */
14389 0x0085, /* NEXT LINE */
14390 0x2028, /* LINE SEPARATOR */
14391 0x2029, /* PARAGRAPH SEPARATOR */
14392 };
14393
Fred Drakee4315f52000-05-09 19:53:39 +000014394 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014395 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014396 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014397 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014398 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014399
Guido van Rossumcacfc072002-05-24 19:01:59 +000014400 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014401 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014402
14403 /* initialize the linebreak bloom filter */
14404 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014405 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014406 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014407
14408 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014409
Benjamin Petersonc4311282012-10-30 23:21:10 -040014410 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14411 Py_FatalError("Can't initialize field name iterator type");
14412
14413 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14414 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014415
Victor Stinner3a50e702011-10-18 21:21:00 +020014416#ifdef HAVE_MBCS
14417 winver.dwOSVersionInfoSize = sizeof(winver);
14418 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14419 PyErr_SetFromWindowsErr(0);
14420 return -1;
14421 }
14422#endif
14423 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014424}
14425
14426/* Finalize the Unicode implementation */
14427
/* Clear the free list of the Unicode implementation.

   This implementation does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14433
Guido van Rossumd57fd912000-03-10 22:53:23 +000014434void
Thomas Wouters78890102000-07-22 19:25:51 +000014435_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014436{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014437 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014438
Serhiy Storchaka05997252013-01-26 12:14:02 +020014439 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014440
Serhiy Storchaka05997252013-01-26 12:14:02 +020014441 for (i = 0; i < 256; i++)
14442 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014443 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014444 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014445}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014446
Walter Dörwald16807132007-05-25 13:52:07 +000014447void
14448PyUnicode_InternInPlace(PyObject **p)
14449{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014450 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014451 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014452#ifdef Py_DEBUG
14453 assert(s != NULL);
14454 assert(_PyUnicode_CHECK(s));
14455#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014456 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014457 return;
14458#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014459 /* If it's a subclass, we don't really know what putting
14460 it in the interned dict might do. */
14461 if (!PyUnicode_CheckExact(s))
14462 return;
14463 if (PyUnicode_CHECK_INTERNED(s))
14464 return;
14465 if (interned == NULL) {
14466 interned = PyDict_New();
14467 if (interned == NULL) {
14468 PyErr_Clear(); /* Don't leave an exception */
14469 return;
14470 }
14471 }
14472 /* It might be that the GetItem call fails even
14473 though the key is present in the dictionary,
14474 namely when this happens during a stack overflow. */
14475 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014476 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014477 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014478
Benjamin Peterson29060642009-01-31 22:14:21 +000014479 if (t) {
14480 Py_INCREF(t);
14481 Py_DECREF(*p);
14482 *p = t;
14483 return;
14484 }
Walter Dörwald16807132007-05-25 13:52:07 +000014485
Benjamin Peterson14339b62009-01-31 16:36:08 +000014486 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014487 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014488 PyErr_Clear();
14489 PyThreadState_GET()->recursion_critical = 0;
14490 return;
14491 }
14492 PyThreadState_GET()->recursion_critical = 0;
14493 /* The two references in interned are not counted by refcnt.
14494 The deallocator will take care of this */
14495 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014496 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014497}
14498
14499void
14500PyUnicode_InternImmortal(PyObject **p)
14501{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014502 PyUnicode_InternInPlace(p);
14503 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014504 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014505 Py_INCREF(*p);
14506 }
Walter Dörwald16807132007-05-25 13:52:07 +000014507}
14508
14509PyObject *
14510PyUnicode_InternFromString(const char *cp)
14511{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014512 PyObject *s = PyUnicode_FromString(cp);
14513 if (s == NULL)
14514 return NULL;
14515 PyUnicode_InternInPlace(&s);
14516 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014517}
14518
Alexander Belopolsky40018472011-02-26 01:02:56 +000014519void
14520_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014521{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014522 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014523 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014524 Py_ssize_t i, n;
14525 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014526
Benjamin Peterson14339b62009-01-31 16:36:08 +000014527 if (interned == NULL || !PyDict_Check(interned))
14528 return;
14529 keys = PyDict_Keys(interned);
14530 if (keys == NULL || !PyList_Check(keys)) {
14531 PyErr_Clear();
14532 return;
14533 }
Walter Dörwald16807132007-05-25 13:52:07 +000014534
Benjamin Peterson14339b62009-01-31 16:36:08 +000014535 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14536 detector, interned unicode strings are not forcibly deallocated;
14537 rather, we give them their stolen references back, and then clear
14538 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014539
Benjamin Peterson14339b62009-01-31 16:36:08 +000014540 n = PyList_GET_SIZE(keys);
14541 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014542 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014543 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014544 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014545 if (PyUnicode_READY(s) == -1) {
14546 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014547 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014549 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014550 case SSTATE_NOT_INTERNED:
14551 /* XXX Shouldn't happen */
14552 break;
14553 case SSTATE_INTERNED_IMMORTAL:
14554 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014555 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014556 break;
14557 case SSTATE_INTERNED_MORTAL:
14558 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014559 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014560 break;
14561 default:
14562 Py_FatalError("Inconsistent interned string state.");
14563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014564 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014565 }
14566 fprintf(stderr, "total size of all interned strings: "
14567 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14568 "mortal/immortal\n", mortal_size, immortal_size);
14569 Py_DECREF(keys);
14570 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014571 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014572}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014573
14574
/********************* Unicode Iterator **************************/
14576
14577typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014578 PyObject_HEAD
14579 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014580 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014581} unicodeiterobject;
14582
14583static void
14584unicodeiter_dealloc(unicodeiterobject *it)
14585{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014586 _PyObject_GC_UNTRACK(it);
14587 Py_XDECREF(it->it_seq);
14588 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014589}
14590
14591static int
14592unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14593{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014594 Py_VISIT(it->it_seq);
14595 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014596}
14597
14598static PyObject *
14599unicodeiter_next(unicodeiterobject *it)
14600{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014601 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014602
Benjamin Peterson14339b62009-01-31 16:36:08 +000014603 assert(it != NULL);
14604 seq = it->it_seq;
14605 if (seq == NULL)
14606 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014607 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014609 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14610 int kind = PyUnicode_KIND(seq);
14611 void *data = PyUnicode_DATA(seq);
14612 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14613 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014614 if (item != NULL)
14615 ++it->it_index;
14616 return item;
14617 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014618
Benjamin Peterson14339b62009-01-31 16:36:08 +000014619 Py_DECREF(seq);
14620 it->it_seq = NULL;
14621 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014622}
14623
14624static PyObject *
14625unicodeiter_len(unicodeiterobject *it)
14626{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014627 Py_ssize_t len = 0;
14628 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014629 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014630 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014631}
14632
14633PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14634
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014635static PyObject *
14636unicodeiter_reduce(unicodeiterobject *it)
14637{
14638 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014639 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014640 it->it_seq, it->it_index);
14641 } else {
14642 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14643 if (u == NULL)
14644 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014645 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014646 }
14647}
14648
14649PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14650
14651static PyObject *
14652unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14653{
14654 Py_ssize_t index = PyLong_AsSsize_t(state);
14655 if (index == -1 && PyErr_Occurred())
14656 return NULL;
14657 if (index < 0)
14658 index = 0;
14659 it->it_index = index;
14660 Py_RETURN_NONE;
14661}
14662
14663PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14664
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014665static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014666 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014667 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014668 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14669 reduce_doc},
14670 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14671 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014672 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014673};
14674
14675PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014676 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14677 "str_iterator", /* tp_name */
14678 sizeof(unicodeiterobject), /* tp_basicsize */
14679 0, /* tp_itemsize */
14680 /* methods */
14681 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14682 0, /* tp_print */
14683 0, /* tp_getattr */
14684 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014685 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014686 0, /* tp_repr */
14687 0, /* tp_as_number */
14688 0, /* tp_as_sequence */
14689 0, /* tp_as_mapping */
14690 0, /* tp_hash */
14691 0, /* tp_call */
14692 0, /* tp_str */
14693 PyObject_GenericGetAttr, /* tp_getattro */
14694 0, /* tp_setattro */
14695 0, /* tp_as_buffer */
14696 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14697 0, /* tp_doc */
14698 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14699 0, /* tp_clear */
14700 0, /* tp_richcompare */
14701 0, /* tp_weaklistoffset */
14702 PyObject_SelfIter, /* tp_iter */
14703 (iternextfunc)unicodeiter_next, /* tp_iternext */
14704 unicodeiter_methods, /* tp_methods */
14705 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014706};
14707
14708static PyObject *
14709unicode_iter(PyObject *seq)
14710{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014711 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014712
Benjamin Peterson14339b62009-01-31 16:36:08 +000014713 if (!PyUnicode_Check(seq)) {
14714 PyErr_BadInternalCall();
14715 return NULL;
14716 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014717 if (PyUnicode_READY(seq) == -1)
14718 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014719 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14720 if (it == NULL)
14721 return NULL;
14722 it->it_index = 0;
14723 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014724 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014725 _PyObject_GC_TRACK(it);
14726 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014727}
14728
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014729
/* Return the number of Py_UNICODE elements in the null-terminated string
   `u`, not counting the terminator.

   The length is computed as a pointer difference and returned as size_t,
   so it is not limited to the range of `int`. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end)
        end++;
    return (size_t)(end - u);
}
14738
/* Copy the null-terminated string `s2`, terminator included, into `s1`.
   No bounds are checked; the caller must provide a large enough buffer.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        const Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14746
/* Copy at most `n` elements of the null-terminated string `s2` into `s1`.

   Copying stops right after the terminator has been copied, or once `n`
   elements have been written, whichever comes first.  Consequently:
     - nothing is written when n == 0;
     - the result is NOT null-terminated when `s2` has `n` or more
       characters;
     - the remainder of `s1` is not padded with null characters.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *dst = s1;

    /* Test the budget BEFORE each store so that never more than n
       elements are written. */
    while (n > 0) {
        n--;
        if ((*dst++ = *s2++) == 0)
            break;
    }
    return s1;
}
14756
/* Append the null-terminated string `s2` (terminator included) to the end
   of the null-terminated string in `s1`.  No bounds are checked.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* Locate the terminator of the destination... */
    while (*tail)
        tail++;
    /* ...and copy the source over it, including its own terminator. */
    do {
        *tail = *s2++;
    } while (*tail++ != 0);
    return s1;
}
14765
/* Compare two null-terminated Py_UNICODE strings element by element.

   Returns -1, 0 or +1 when `s1` sorts before, equal to, or after `s2`.
   A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        const Py_UNICODE a = *s1;
        const Py_UNICODE b = *s2;

        if (a == 0 || b == 0) {
            /* At least one string ended: the longer one is greater. */
            if (a)
                return 1;
            if (b)
                return -1;
            return 0;
        }
        if (a != b)
            return (a < b) ? -1 : +1;
    }
}
14779
/* Compare at most `n` elements of two null-terminated Py_UNICODE strings.

   Returns -1 or +1 at the first differing element, and 0 if the strings
   agree over the first `n` elements or up to and including a common
   terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t pos;

    for (pos = 0; pos < n; pos++) {
        const Py_UNICODE a = s1[pos];
        const Py_UNICODE b = s2[pos];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings ended together */
    }
    return 0;
}
14796
/* Find the first occurrence of `c` in the null-terminated string `s`.

   Returns a pointer to the matching element, or NULL if there is none.
   The terminator itself is never matched, so searching for 0 yields
   NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14806
/* Find the last occurrence of `c` in the null-terminated string `s`.

   Returns a pointer to the matching element, or NULL if there is none.
   The terminator itself is never matched, so searching for 0 yields
   NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass, remembering the most recent match. */
    for (; *s; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014819
Victor Stinner71133ff2010-09-01 23:43:53 +000014820Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014821PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014822{
Victor Stinner577db2c2011-10-11 22:12:48 +020014823 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014824 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014826 if (!PyUnicode_Check(unicode)) {
14827 PyErr_BadArgument();
14828 return NULL;
14829 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014830 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014831 if (u == NULL)
14832 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014833 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014834 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014835 PyErr_NoMemory();
14836 return NULL;
14837 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014838 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014839 size *= sizeof(Py_UNICODE);
14840 copy = PyMem_Malloc(size);
14841 if (copy == NULL) {
14842 PyErr_NoMemory();
14843 return NULL;
14844 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014845 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014846 return copy;
14847}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014848
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
14851
14852static PyMethodDef _string_methods[] = {
14853 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14854 METH_O, PyDoc_STR("split the argument as a field name")},
14855 {"formatter_parser", (PyCFunction) formatter_parser,
14856 METH_O, PyDoc_STR("parse the argument as a format string")},
14857 {NULL, NULL}
14858};
14859
14860static struct PyModuleDef _string_module = {
14861 PyModuleDef_HEAD_INIT,
14862 "_string",
14863 PyDoc_STR("string helper module"),
14864 0,
14865 _string_methods,
14866 NULL,
14867 NULL,
14868 NULL,
14869 NULL
14870};
14871
14872PyMODINIT_FUNC
14873PyInit__string(void)
14874{
14875 return PyModule_Create(&_string_module);
14876}
14877
14878
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014879#ifdef __cplusplus
14880}
14881#endif