blob: e0b507f3f080b0bb49cb0307b061e3f3a21eea5e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
Serhiy Storchaka05997252013-01-26 12:14:02 +020052NOTE: In the interpreter's initialization phase, some globals are currently
53 initialized dynamically as needed. In the process Unicode objects may
54 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055
56*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Largest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Check applied to an object before it is touched (used inside assert()):
   debug builds run the structural consistency check without inspecting the
   characters, other builds only perform the type check. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* Raw field accessors.  The variants whose name starts with an underscore
   expand to the struct member itself (so they can be assigned to); the
   PyUnicode_UTF8* variants assert that the object is a ready string and
   pick the right storage for compact ASCII strings. */

/* Cached UTF-8 pointer stored in the compact header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 data of a ready string: for compact ASCII strings this is the
   memory immediately following the PyASCIIObject header, otherwise the
   stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Stored length of the cached UTF-8 data. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the character count for compact ASCII
   strings, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation pointer and its stored length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the common header (no checks). */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, after asserting that op is a unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* Cheap stand-in for Py_MAX() when combining two maximum characters: the
   bitwise OR of the operands is never smaller than either of them.  The
   result can be larger than the true maximum, so use it only when you are
   computing the second argument of PyUnicode_New(). */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
111
/* File-local replacement for PyUnicode_READY(): asserts that op passes
   _PyUnicode_CHECK(), then evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
          _PyUnicode_Ready(op) :                        \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the UTF-8 pointer of op is the same address as its character
   data, i.e. no separate UTF-8 buffer exists.  Must not be used on compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same address as its character
   data.  The macro argument is used for both operands; it must not rely on
   a variable that happens to be called "unicode" at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True when the object owns a separate UTF-8 buffer: it is not a compact
   ASCII string, its utf8 pointer is set, and that pointer is not simply
   the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     !(PyUnicode_IS_COMPACT_ASCII(op)                   \
       || !_PyUnicode_UTF8(op)                          \
       || _PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True when the object owns a separate wstr buffer: its wstr pointer is
   set and the string is either not ready yet or its wstr pointer differs
   from the character data.  PyUnicode_DATA() is only evaluated for ready
   strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      !(PyUnicode_IS_READY(op) &&                       \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
142
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters; to
   points to the buffer receiving the converted characters (it is cast to
   "to_type *" as a whole expression, so arguments such as "buf + offset"
   are handled correctly).  Each character is converted with a plain cast.
   The main loop handles four characters per iteration; the tail loop
   copies whatever remains. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200166
Walter Dörwald16807132007-05-25 13:52:07 +0000167/* This dictionary holds all interned unicode strings. Note that references
168 to strings in this dictionary are *not* counted in the string's ob_refcnt.
169 When the interned string reaches a refcnt of 0 the string deallocation
170 function will delete the reference from this dictionary.
171
172 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000173 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000174*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200175static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000177/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179
Serhiy Storchaka678db842013-01-26 12:16:36 +0200180#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200181 do { \
182 if (unicode_empty != NULL) \
183 Py_INCREF(unicode_empty); \
184 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185 unicode_empty = PyUnicode_New(0, 0); \
186 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200187 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
189 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200190 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200191 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192
Serhiy Storchaka678db842013-01-26 12:16:36 +0200193#define _Py_RETURN_UNICODE_EMPTY() \
194 do { \
195 _Py_INCREF_UNICODE_EMPTY(); \
196 return unicode_empty; \
197 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200199/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202/* Single character Unicode strings in the Latin-1 range are being
203 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
/* Fast detection of the most frequent whitespace characters: one flag per
   ASCII code point (128 entries, 16 per row below).  Flagged entries:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
236
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100240static int unicode_modifiable(PyObject *unicode);
241
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100244_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200245static PyObject *
246_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
247static PyObject *
248_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
249
250static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000251unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100253 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
255
Alexander Belopolsky40018472011-02-26 01:02:56 +0000256static void
257raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300258 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100259 PyObject *unicode,
260 Py_ssize_t startpos, Py_ssize_t endpos,
261 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000262
/* Same kind of table for line breaks: one flag per ASCII code point
   (128 entries, 16 per row below).  The table is only ever read, so it is
   declared const.  Flagged entries:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
290
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300291/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
292 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000294PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000296#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 /* This is actually an illegal character, so it should
300 not be passed to unichr. */
301 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#endif
303}
304
#ifdef Py_DEBUG
/* Debug-build sanity check of a unicode object's internal layout.

   op must be a unicode object.  The header flags (kind, ascii, compact,
   ready), the data / utf8 / wstr pointers and their cached lengths are
   checked against each other with assert().  When check_content is non-zero
   and the string is not in the legacy wchar_t form, the characters are also
   scanned to verify that the narrowest suitable kind is used and that the
   data ends with a null character.

   Always returns 1, so the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        /* compact ASCII: nothing but the basic header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *chdr = (PyCompactUnicodeObject *)op;
        void *chars;
        /* kind whose storage has the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (hdr->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            chars = chdr + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(chdr->utf8 != chars);
        }
        else {
            PyUnicodeObject *full = (PyUnicodeObject *)op;

            chars = full->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact, ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ready == 1);
                assert(chars != NULL);
                if (hdr->state.ascii) {
                    assert(chdr->utf8 == chars);
                    assert(chdr->utf8_length == hdr->length);
                }
                else {
                    assert(chdr->utf8 != chars);
                }
            }
            else {
                /* legacy wchar_t string that is not ready yet */
                assert(hdr->length == 0);
                assert(hdr->hash == -1);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ascii == 0);
                assert(hdr->state.ready == 0);
                assert(hdr->state.interned == SSTATE_NOT_INTERNED);
                assert(hdr->wstr != NULL);
                assert(chars == NULL);
                assert(chdr->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the widths match */
            if (kind == wchar_kind) {
                assert(hdr->wstr == chars);
                assert(chdr->wstr_length == hdr->length);
            }
            else {
                assert(hdr->wstr != chars);
            }
        }

        /* absent caches must have a zero cached length */
        if (chdr->utf8 == NULL) {
            assert(chdr->utf8_length == 0);
        }
        if (hdr->wstr == NULL) {
            assert(chdr->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(hdr);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < hdr->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (ch > highest)
                highest = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (hdr->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* the data must end with a null character */
        assert(PyUnicode_READ(kind, chars, hdr->length) == 0);
    }
    return 1;
}
#endif
420
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100421static PyObject*
422unicode_result_wchar(PyObject *unicode)
423{
424#ifndef Py_DEBUG
425 Py_ssize_t len;
426
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100427 len = _PyUnicode_WSTR_LENGTH(unicode);
428 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200430 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 }
432
433 if (len == 1) {
434 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100435 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100436 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
437 Py_DECREF(unicode);
438 return latin1_char;
439 }
440 }
441
442 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200443 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100444 return NULL;
445 }
446#else
Victor Stinneraa771272012-10-04 02:32:58 +0200447 assert(Py_REFCNT(unicode) == 1);
448
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100449 /* don't make the result ready in debug mode to ensure that the caller
450 makes the string ready before using it */
451 assert(_PyUnicode_CheckConsistency(unicode, 1));
452#endif
453 return unicode;
454}
455
456static PyObject*
457unicode_result_ready(PyObject *unicode)
458{
459 Py_ssize_t length;
460
461 length = PyUnicode_GET_LENGTH(unicode);
462 if (length == 0) {
463 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100464 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200465 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 }
467 return unicode_empty;
468 }
469
470 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200471 void *data = PyUnicode_DATA(unicode);
472 int kind = PyUnicode_KIND(unicode);
473 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 if (ch < 256) {
475 PyObject *latin1_char = unicode_latin1[ch];
476 if (latin1_char != NULL) {
477 if (unicode != latin1_char) {
478 Py_INCREF(latin1_char);
479 Py_DECREF(unicode);
480 }
481 return latin1_char;
482 }
483 else {
484 assert(_PyUnicode_CheckConsistency(unicode, 1));
485 Py_INCREF(unicode);
486 unicode_latin1[ch] = unicode;
487 return unicode;
488 }
489 }
490 }
491
492 assert(_PyUnicode_CheckConsistency(unicode, 1));
493 return unicode;
494}
495
496static PyObject*
497unicode_result(PyObject *unicode)
498{
499 assert(_PyUnicode_CHECK(unicode));
500 if (PyUnicode_IS_READY(unicode))
501 return unicode_result_ready(unicode);
502 else
503 return unicode_result_wchar(unicode);
504}
505
Victor Stinnerc4b49542011-12-11 22:44:26 +0100506static PyObject*
507unicode_result_unchanged(PyObject *unicode)
508{
509 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500510 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100511 return NULL;
512 Py_INCREF(unicode);
513 return unicode;
514 }
515 else
516 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100517 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100518}
519
#ifdef HAVE_MBCS
/* Windows version record, only present when MBCS support is compiled in.
   NOTE(review): presumably filled in during module initialisation and read
   by the MBCS codec code -- confirm against the rest of the file. */
static OSVERSIONINFOEX winver;
#endif
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
526/* stuff to implement simple "bloom filters" for Unicode characters.
527 to keep things simple, we use a single bitmask, using the least 5
528 bits from each unicode characters as the bit index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
/* Integer type holding one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Mask consulted by BLOOM_LINEBREAK().  It is initialised with every bit
   set, so BLOOM() accepts every character until the mask is replaced. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by ch is set in mask.  Both arguments are
   parenthesised so that compound expressions (e.g. "a | b") can be passed
   safely. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in ascii_linebreak;
   for the others the bloom mask is checked first and
   Py_UNICODE_ISLINEBREAK() is only consulted when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
/* True if chr occurs in the unicode object str.  The bloom mask (built from
   str's characters) is tested first; PyUnicode_FindChar() is only called
   over the whole string when the mask matches. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000593
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200594/* Compilation of templated routines */
595
596#include "stringlib/asciilib.h"
597#include "stringlib/fastsearch.h"
598#include "stringlib/partition.h"
599#include "stringlib/split.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
602#include "stringlib/find_max_char.h"
603#include "stringlib/localeutil.h"
604#include "stringlib/undef.h"
605
606#include "stringlib/ucs1lib.h"
607#include "stringlib/fastsearch.h"
608#include "stringlib/partition.h"
609#include "stringlib/split.h"
610#include "stringlib/count.h"
611#include "stringlib/find.h"
612#include "stringlib/find_max_char.h"
613#include "stringlib/localeutil.h"
614#include "stringlib/undef.h"
615
616#include "stringlib/ucs2lib.h"
617#include "stringlib/fastsearch.h"
618#include "stringlib/partition.h"
619#include "stringlib/split.h"
620#include "stringlib/count.h"
621#include "stringlib/find.h"
622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
632#include "stringlib/find_max_char.h"
633#include "stringlib/localeutil.h"
634#include "stringlib/undef.h"
635
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200636#include "stringlib/unicodedefs.h"
637#include "stringlib/fastsearch.h"
638#include "stringlib/count.h"
639#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100640#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200641
Guido van Rossumd57fd912000-03-10 22:53:23 +0000642/* --- Unicode Object ----------------------------------------------------- */
643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200644static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200645fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
648 Py_ssize_t size, Py_UCS4 ch,
649 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200650{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200651 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
652
653 switch (kind) {
654 case PyUnicode_1BYTE_KIND:
655 {
656 Py_UCS1 ch1 = (Py_UCS1) ch;
657 if (ch1 == ch)
658 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
659 else
660 return -1;
661 }
662 case PyUnicode_2BYTE_KIND:
663 {
664 Py_UCS2 ch2 = (Py_UCS2) ch;
665 if (ch2 == ch)
666 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
667 else
668 return -1;
669 }
670 case PyUnicode_4BYTE_KIND:
671 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
672 default:
673 assert(0);
674 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676}
677
#ifdef Py_DEBUG
/* Fill the newly added part of a string's data with 0xff bytes so that use
   of uninitialised characters is detected earlier.

   Everything from character index old_length up to the current length is
   overwritten; nothing happens if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t grown = _PyUnicode_LENGTH(unicode) - old_length;

    if (grown > 0) {
        /* offsets and sizes are in bytes: characters times character size */
        memset(bytes + old_length * char_size, 0xff, grown * char_size);
    }
}
#endif
696
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697static PyObject*
698resize_compact(PyObject *unicode, Py_ssize_t length)
699{
700 Py_ssize_t char_size;
701 Py_ssize_t struct_size;
702 Py_ssize_t new_size;
703 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100704 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200705#ifdef Py_DEBUG
706 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
707#endif
708
Victor Stinner79891572012-05-03 13:43:07 +0200709 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200710 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100711 assert(PyUnicode_IS_COMPACT(unicode));
712
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200713 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100714 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715 struct_size = sizeof(PyASCIIObject);
716 else
717 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
721 PyErr_NoMemory();
722 return NULL;
723 }
724 new_size = (struct_size + (length + 1) * char_size);
725
Victor Stinner84def372011-12-11 20:04:56 +0100726 _Py_DEC_REFTOTAL;
727 _Py_ForgetReference(unicode);
728
729 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
730 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100731 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 PyErr_NoMemory();
733 return NULL;
734 }
Victor Stinner84def372011-12-11 20:04:56 +0100735 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100737
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200739 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100741 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200742 _PyUnicode_WSTR_LENGTH(unicode) = length;
743 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100744 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
745 PyObject_DEL(_PyUnicode_WSTR(unicode));
746 _PyUnicode_WSTR(unicode) = NULL;
747 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200748#ifdef Py_DEBUG
749 unicode_fill_invalid(unicode, old_length);
750#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200751 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
752 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200753 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200754 return unicode;
755}
756
Alexander Belopolsky40018472011-02-26 01:02:56 +0000757static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200758resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000759{
Victor Stinner95663112011-10-04 01:03:50 +0200760 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100761 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200762 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200763 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000764
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 if (PyUnicode_IS_READY(unicode)) {
766 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200767 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200769#ifdef Py_DEBUG
770 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
771#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200772
773 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200774 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200775 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
776 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200777
778 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
779 PyErr_NoMemory();
780 return -1;
781 }
782 new_size = (length + 1) * char_size;
783
Victor Stinner7a9105a2011-12-12 00:13:42 +0100784 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
785 {
786 PyObject_DEL(_PyUnicode_UTF8(unicode));
787 _PyUnicode_UTF8(unicode) = NULL;
788 _PyUnicode_UTF8_LENGTH(unicode) = 0;
789 }
790
Victor Stinnerfe226c02011-10-03 03:52:20 +0200791 data = (PyObject *)PyObject_REALLOC(data, new_size);
792 if (data == NULL) {
793 PyErr_NoMemory();
794 return -1;
795 }
796 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200797 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200799 _PyUnicode_WSTR_LENGTH(unicode) = length;
800 }
801 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200802 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200803 _PyUnicode_UTF8_LENGTH(unicode) = length;
804 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 _PyUnicode_LENGTH(unicode) = length;
806 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200807#ifdef Py_DEBUG
808 unicode_fill_invalid(unicode, old_length);
809#endif
Victor Stinner95663112011-10-04 01:03:50 +0200810 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200811 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinner95663112011-10-04 01:03:50 +0200815 assert(_PyUnicode_WSTR(unicode) != NULL);
816
817 /* check for integer overflow */
818 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
819 PyErr_NoMemory();
820 return -1;
821 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100822 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200823 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100824 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200825 if (!wstr) {
826 PyErr_NoMemory();
827 return -1;
828 }
829 _PyUnicode_WSTR(unicode) = wstr;
830 _PyUnicode_WSTR(unicode)[length] = 0;
831 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200832 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000833 return 0;
834}
835
Victor Stinnerfe226c02011-10-03 03:52:20 +0200836static PyObject*
837resize_copy(PyObject *unicode, Py_ssize_t length)
838{
839 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100840 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200841 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100842
Benjamin Petersonbac79492012-01-14 13:34:47 -0500843 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100844 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200845
846 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
847 if (copy == NULL)
848 return NULL;
849
850 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200851 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200852 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200853 }
854 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200855 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100856
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200857 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200858 if (w == NULL)
859 return NULL;
860 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
861 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200862 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
863 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200864 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200865 }
866}
867
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000869 Ux0000 terminated; some code (e.g. new_identifier)
870 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871
872 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000873 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874
875*/
876
Alexander Belopolsky40018472011-02-26 01:02:56 +0000877static PyUnicodeObject *
878_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879{
880 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882
Thomas Wouters477c8d52006-05-27 19:21:47 +0000883 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 if (length == 0 && unicode_empty != NULL) {
885 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200886 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000887 }
888
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000889 /* Ensure we won't overflow the size. */
890 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
891 return (PyUnicodeObject *)PyErr_NoMemory();
892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200893 if (length < 0) {
894 PyErr_SetString(PyExc_SystemError,
895 "Negative size passed to _PyUnicode_New");
896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000897 }
898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
900 if (unicode == NULL)
901 return NULL;
902 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
903 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
904 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100905 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000906 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100907 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200909
Jeremy Hyltond8082792003-09-16 19:41:39 +0000910 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000911 * the caller fails before initializing str -- unicode_resize()
912 * reads str[0], and the Keep-Alive optimization can keep memory
913 * allocated for str alive across a call to unicode_dealloc(unicode).
914 * We don't want unicode_resize to read uninitialized memory in
915 * that case.
916 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode)[0] = 0;
918 _PyUnicode_WSTR(unicode)[length] = 0;
919 _PyUnicode_WSTR_LENGTH(unicode) = length;
920 _PyUnicode_HASH(unicode) = -1;
921 _PyUnicode_STATE(unicode).interned = 0;
922 _PyUnicode_STATE(unicode).kind = 0;
923 _PyUnicode_STATE(unicode).compact = 0;
924 _PyUnicode_STATE(unicode).ready = 0;
925 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200926 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200928 _PyUnicode_UTF8(unicode) = NULL;
929 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100930 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000931 return unicode;
932}
933
Victor Stinnerf42dc442011-10-02 23:33:16 +0200934static const char*
935unicode_kind_name(PyObject *unicode)
936{
Victor Stinner42dfd712011-10-03 14:41:45 +0200937 /* don't check consistency: unicode_kind_name() is called from
938 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200939 if (!PyUnicode_IS_COMPACT(unicode))
940 {
941 if (!PyUnicode_IS_READY(unicode))
942 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600943 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200944 {
945 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200946 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200947 return "legacy ascii";
948 else
949 return "legacy latin1";
950 case PyUnicode_2BYTE_KIND:
951 return "legacy UCS2";
952 case PyUnicode_4BYTE_KIND:
953 return "legacy UCS4";
954 default:
955 return "<legacy invalid kind>";
956 }
957 }
958 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600959 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200960 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200961 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200962 return "ascii";
963 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200965 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200966 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200967 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 default:
970 return "<invalid compact kind>";
971 }
972}
973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
979
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
983void *_PyUnicode_data(void *unicode){
984 printf("obj %p\n", unicode);
985 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
986 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
987 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
988 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
989 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
990 return PyUnicode_DATA(unicode);
991}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200992
993void
994_PyUnicode_Dump(PyObject *op)
995{
996 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200997 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
998 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
999 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001000
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001002 {
1003 if (ascii->state.ascii)
1004 data = (ascii + 1);
1005 else
1006 data = (compact + 1);
1007 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001008 else
1009 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001010 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1011
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 if (ascii->wstr == data)
1013 printf("shared ");
1014 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001015
Victor Stinnera3b334d2011-10-03 13:53:37 +02001016 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 printf(" (%zu), ", compact->wstr_length);
1018 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1019 printf("shared ");
1020 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001021 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001022 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001023}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001024#endif
1025
1026PyObject *
1027PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1028{
1029 PyObject *obj;
1030 PyCompactUnicodeObject *unicode;
1031 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001032 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001033 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 Py_ssize_t char_size;
1035 Py_ssize_t struct_size;
1036
1037 /* Optimization for empty strings */
1038 if (size == 0 && unicode_empty != NULL) {
1039 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001040 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 }
1042
Victor Stinner9e9d6892011-10-04 01:02:02 +02001043 is_ascii = 0;
1044 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 struct_size = sizeof(PyCompactUnicodeObject);
1046 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001047 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 char_size = 1;
1049 is_ascii = 1;
1050 struct_size = sizeof(PyASCIIObject);
1051 }
1052 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001053 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 char_size = 1;
1055 }
1056 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 2;
1059 if (sizeof(wchar_t) == 2)
1060 is_sharing = 1;
1061 }
1062 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001063 if (maxchar > MAX_UNICODE) {
1064 PyErr_SetString(PyExc_SystemError,
1065 "invalid maximum character passed to PyUnicode_New");
1066 return NULL;
1067 }
Victor Stinner8f825062012-04-27 13:55:39 +02001068 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 char_size = 4;
1070 if (sizeof(wchar_t) == 4)
1071 is_sharing = 1;
1072 }
1073
1074 /* Ensure we won't overflow the size. */
1075 if (size < 0) {
1076 PyErr_SetString(PyExc_SystemError,
1077 "Negative size passed to PyUnicode_New");
1078 return NULL;
1079 }
1080 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1081 return PyErr_NoMemory();
1082
1083 /* Duplicated allocation code from _PyObject_New() instead of a call to
1084 * PyObject_New() so we are able to allocate space for the object and
1085 * it's data buffer.
1086 */
1087 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1088 if (obj == NULL)
1089 return PyErr_NoMemory();
1090 obj = PyObject_INIT(obj, &PyUnicode_Type);
1091 if (obj == NULL)
1092 return NULL;
1093
1094 unicode = (PyCompactUnicodeObject *)obj;
1095 if (is_ascii)
1096 data = ((PyASCIIObject*)obj) + 1;
1097 else
1098 data = unicode + 1;
1099 _PyUnicode_LENGTH(unicode) = size;
1100 _PyUnicode_HASH(unicode) = -1;
1101 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001102 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 _PyUnicode_STATE(unicode).compact = 1;
1104 _PyUnicode_STATE(unicode).ready = 1;
1105 _PyUnicode_STATE(unicode).ascii = is_ascii;
1106 if (is_ascii) {
1107 ((char*)data)[size] = 0;
1108 _PyUnicode_WSTR(unicode) = NULL;
1109 }
Victor Stinner8f825062012-04-27 13:55:39 +02001110 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001115 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 else {
1118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001122 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 ((Py_UCS4*)data)[size] = 0;
1124 if (is_sharing) {
1125 _PyUnicode_WSTR_LENGTH(unicode) = size;
1126 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1127 }
1128 else {
1129 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1130 _PyUnicode_WSTR(unicode) = NULL;
1131 }
1132 }
Victor Stinner8f825062012-04-27 13:55:39 +02001133#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001134 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001135#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001136 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 return obj;
1138}
1139
1140#if SIZEOF_WCHAR_T == 2
1141/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1142 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001143 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144
1145 This function assumes that unicode can hold one more code point than wstr
1146 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001147static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001149 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150{
1151 const wchar_t *iter;
1152 Py_UCS4 *ucs4_out;
1153
Victor Stinner910337b2011-10-03 03:20:16 +02001154 assert(unicode != NULL);
1155 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1157 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1158
1159 for (iter = begin; iter < end; ) {
1160 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1161 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001162 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1163 && (iter+1) < end
1164 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 {
Victor Stinner551ac952011-11-29 22:58:13 +01001166 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 iter += 2;
1168 }
1169 else {
1170 *ucs4_out++ = *iter;
1171 iter++;
1172 }
1173 }
1174 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1175 _PyUnicode_GET_LENGTH(unicode)));
1176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001177}
1178#endif
1179
Victor Stinnercd9950f2011-10-02 00:34:53 +02001180static int
Victor Stinner488fa492011-12-12 00:01:39 +01001181unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001182{
Victor Stinner488fa492011-12-12 00:01:39 +01001183 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001184 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001185 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186 return -1;
1187 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001188 return 0;
1189}
1190
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001191static int
1192_copy_characters(PyObject *to, Py_ssize_t to_start,
1193 PyObject *from, Py_ssize_t from_start,
1194 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001195{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001196 unsigned int from_kind, to_kind;
1197 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198
Victor Stinneree4544c2012-05-09 22:24:08 +02001199 assert(0 <= how_many);
1200 assert(0 <= from_start);
1201 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001203 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001204 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205
Victor Stinnerd3f08822012-05-29 12:57:52 +02001206 assert(PyUnicode_Check(to));
1207 assert(PyUnicode_IS_READY(to));
1208 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1209
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001210 if (how_many == 0)
1211 return 0;
1212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001214 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001215 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217
Victor Stinnerf1852262012-06-16 16:38:26 +02001218#ifdef Py_DEBUG
1219 if (!check_maxchar
1220 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1221 {
1222 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1223 Py_UCS4 ch;
1224 Py_ssize_t i;
1225 for (i=0; i < how_many; i++) {
1226 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1227 assert(ch <= to_maxchar);
1228 }
1229 }
1230#endif
1231
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001233 if (check_maxchar
1234 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1235 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 /* Writing Latin-1 characters into an ASCII string requires to
1237 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001238 Py_UCS4 max_char;
1239 max_char = ucs1lib_find_max_char(from_data,
1240 (Py_UCS1*)from_data + how_many);
1241 if (max_char >= 128)
1242 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001243 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001244 Py_MEMCPY((char*)to_data + to_kind * to_start,
1245 (char*)from_data + from_kind * from_start,
1246 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001248 else if (from_kind == PyUnicode_1BYTE_KIND
1249 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001250 {
1251 _PyUnicode_CONVERT_BYTES(
1252 Py_UCS1, Py_UCS2,
1253 PyUnicode_1BYTE_DATA(from) + from_start,
1254 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1255 PyUnicode_2BYTE_DATA(to) + to_start
1256 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001257 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001258 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001259 && to_kind == PyUnicode_4BYTE_KIND)
1260 {
1261 _PyUnicode_CONVERT_BYTES(
1262 Py_UCS1, Py_UCS4,
1263 PyUnicode_1BYTE_DATA(from) + from_start,
1264 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1265 PyUnicode_4BYTE_DATA(to) + to_start
1266 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001267 }
1268 else if (from_kind == PyUnicode_2BYTE_KIND
1269 && to_kind == PyUnicode_4BYTE_KIND)
1270 {
1271 _PyUnicode_CONVERT_BYTES(
1272 Py_UCS2, Py_UCS4,
1273 PyUnicode_2BYTE_DATA(from) + from_start,
1274 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1275 PyUnicode_4BYTE_DATA(to) + to_start
1276 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001277 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001278 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001279 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1280
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001281 if (!check_maxchar) {
1282 if (from_kind == PyUnicode_2BYTE_KIND
1283 && to_kind == PyUnicode_1BYTE_KIND)
1284 {
1285 _PyUnicode_CONVERT_BYTES(
1286 Py_UCS2, Py_UCS1,
1287 PyUnicode_2BYTE_DATA(from) + from_start,
1288 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1289 PyUnicode_1BYTE_DATA(to) + to_start
1290 );
1291 }
1292 else if (from_kind == PyUnicode_4BYTE_KIND
1293 && to_kind == PyUnicode_1BYTE_KIND)
1294 {
1295 _PyUnicode_CONVERT_BYTES(
1296 Py_UCS4, Py_UCS1,
1297 PyUnicode_4BYTE_DATA(from) + from_start,
1298 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1299 PyUnicode_1BYTE_DATA(to) + to_start
1300 );
1301 }
1302 else if (from_kind == PyUnicode_4BYTE_KIND
1303 && to_kind == PyUnicode_2BYTE_KIND)
1304 {
1305 _PyUnicode_CONVERT_BYTES(
1306 Py_UCS4, Py_UCS2,
1307 PyUnicode_4BYTE_DATA(from) + from_start,
1308 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1309 PyUnicode_2BYTE_DATA(to) + to_start
1310 );
1311 }
1312 else {
1313 assert(0);
1314 return -1;
1315 }
1316 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001317 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001318 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001319 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001320 Py_ssize_t i;
1321
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 for (i=0; i < how_many; i++) {
1323 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001324 if (ch > to_maxchar)
1325 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1327 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001328 }
1329 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 return 0;
1331}
1332
Victor Stinnerd3f08822012-05-29 12:57:52 +02001333void
1334_PyUnicode_FastCopyCharacters(
1335 PyObject *to, Py_ssize_t to_start,
1336 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001337{
1338 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1339}
1340
1341Py_ssize_t
1342PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1343 PyObject *from, Py_ssize_t from_start,
1344 Py_ssize_t how_many)
1345{
1346 int err;
1347
1348 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1349 PyErr_BadInternalCall();
1350 return -1;
1351 }
1352
Benjamin Petersonbac79492012-01-14 13:34:47 -05001353 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001354 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001355 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001356 return -1;
1357
Victor Stinnerd3f08822012-05-29 12:57:52 +02001358 if (from_start < 0) {
1359 PyErr_SetString(PyExc_IndexError, "string index out of range");
1360 return -1;
1361 }
1362 if (to_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001366 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1367 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1368 PyErr_Format(PyExc_SystemError,
1369 "Cannot write %zi characters at %zi "
1370 "in a string of %zi characters",
1371 how_many, to_start, PyUnicode_GET_LENGTH(to));
1372 return -1;
1373 }
1374
1375 if (how_many == 0)
1376 return 0;
1377
Victor Stinner488fa492011-12-12 00:01:39 +01001378 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001379 return -1;
1380
1381 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1382 if (err) {
1383 PyErr_Format(PyExc_SystemError,
1384 "Cannot copy %s characters "
1385 "into a string of %s characters",
1386 unicode_kind_name(from),
1387 unicode_kind_name(to));
1388 return -1;
1389 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391}
1392
Victor Stinner17222162011-09-28 22:15:37 +02001393/* Find the maximum code point and count the number of surrogate pairs so a
1394 correct string length can be computed before converting a string to UCS4.
1395 This function counts single surrogates as a character and not as a pair.
1396
1397 Return 0 on success, or -1 on error. */
1398static int
1399find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1400 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401{
1402 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001403 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404
Victor Stinnerc53be962011-10-02 21:33:54 +02001405 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 *num_surrogates = 0;
1407 *maxchar = 0;
1408
1409 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001411 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1412 && (iter+1) < end
1413 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1414 {
1415 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1416 ++(*num_surrogates);
1417 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 }
1419 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001421 {
1422 ch = *iter;
1423 iter++;
1424 }
1425 if (ch > *maxchar) {
1426 *maxchar = ch;
1427 if (*maxchar > MAX_UNICODE) {
1428 PyErr_Format(PyExc_ValueError,
1429 "character U+%x is not in range [U+0000; U+10ffff]",
1430 ch);
1431 return -1;
1432 }
1433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
1435 return 0;
1436}
1437
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001438int
1439_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440{
1441 wchar_t *end;
1442 Py_UCS4 maxchar = 0;
1443 Py_ssize_t num_surrogates;
1444#if SIZEOF_WCHAR_T == 2
1445 Py_ssize_t length_wo_surrogates;
1446#endif
1447
Georg Brandl7597add2011-10-05 16:36:47 +02001448 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001449 strings were created using _PyObject_New() and where no canonical
1450 representation (the str field) has been set yet aka strings
1451 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001452 assert(_PyUnicode_CHECK(unicode));
1453 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001455 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001457 /* Actually, it should neither be interned nor be anything else: */
1458 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001461 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001462 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464
1465 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1467 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 PyErr_NoMemory();
1469 return -1;
1470 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001471 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 _PyUnicode_WSTR(unicode), end,
1473 PyUnicode_1BYTE_DATA(unicode));
1474 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1475 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1476 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1477 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001478 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001479 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001480 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
1482 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001483 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8(unicode) = NULL;
1485 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 }
1487 PyObject_FREE(_PyUnicode_WSTR(unicode));
1488 _PyUnicode_WSTR(unicode) = NULL;
1489 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1490 }
1491 /* In this case we might have to convert down from 4-byte native
1492 wchar_t to 2-byte unicode. */
1493 else if (maxchar < 65536) {
1494 assert(num_surrogates == 0 &&
1495 "FindMaxCharAndNumSurrogatePairs() messed up");
1496
Victor Stinner506f5922011-09-28 22:34:18 +02001497#if SIZEOF_WCHAR_T == 2
1498 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001499 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001500 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1501 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1502 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001503 _PyUnicode_UTF8(unicode) = NULL;
1504 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001505#else
1506 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001507 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001508 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001509 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001510 PyErr_NoMemory();
1511 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 }
Victor Stinner506f5922011-09-28 22:34:18 +02001513 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1514 _PyUnicode_WSTR(unicode), end,
1515 PyUnicode_2BYTE_DATA(unicode));
1516 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1517 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1518 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001519 _PyUnicode_UTF8(unicode) = NULL;
1520 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001521 PyObject_FREE(_PyUnicode_WSTR(unicode));
1522 _PyUnicode_WSTR(unicode) = NULL;
1523 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1524#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525 }
1526 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1527 else {
1528#if SIZEOF_WCHAR_T == 2
1529 /* in case the native representation is 2-bytes, we need to allocate a
1530 new normalized 4-byte version. */
1531 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001532 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1533 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 PyErr_NoMemory();
1535 return -1;
1536 }
1537 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1538 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001539 _PyUnicode_UTF8(unicode) = NULL;
1540 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001541 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1542 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001543 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544 PyObject_FREE(_PyUnicode_WSTR(unicode));
1545 _PyUnicode_WSTR(unicode) = NULL;
1546 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1547#else
1548 assert(num_surrogates == 0);
1549
Victor Stinnerc3c74152011-10-02 20:39:55 +02001550 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001552 _PyUnicode_UTF8(unicode) = NULL;
1553 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1555#endif
1556 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1557 }
1558 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001559 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560 return 0;
1561}
1562
Alexander Belopolsky40018472011-02-26 01:02:56 +00001563static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001564unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565{
Walter Dörwald16807132007-05-25 13:52:07 +00001566 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 case SSTATE_NOT_INTERNED:
1568 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001569
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 case SSTATE_INTERNED_MORTAL:
1571 /* revive dead object temporarily for DelItem */
1572 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001573 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 Py_FatalError(
1575 "deletion of interned string failed");
1576 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001577
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 case SSTATE_INTERNED_IMMORTAL:
1579 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001580
Benjamin Peterson29060642009-01-31 22:14:21 +00001581 default:
1582 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001583 }
1584
Victor Stinner03490912011-10-03 23:45:12 +02001585 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001587 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001588 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001589 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1590 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001591
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001592 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593}
1594
#ifdef Py_DEBUG
/* Debug helper: return 1 if 'unicode' is the shared empty string or one of
   the cached single-character Latin-1 strings, and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length one can be a Latin-1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1611
Alexander Belopolsky40018472011-02-26 01:02:56 +00001612static int
Victor Stinner488fa492011-12-12 00:01:39 +01001613unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001614{
Victor Stinner488fa492011-12-12 00:01:39 +01001615 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001616 if (Py_REFCNT(unicode) != 1)
1617 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001618 if (_PyUnicode_HASH(unicode) != -1)
1619 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_CHECK_INTERNED(unicode))
1621 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001622 if (!PyUnicode_CheckExact(unicode))
1623 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001624#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001625 /* singleton refcount is greater than 1 */
1626 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001627#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 return 1;
1629}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001630
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631static int
1632unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1633{
1634 PyObject *unicode;
1635 Py_ssize_t old_length;
1636
1637 assert(p_unicode != NULL);
1638 unicode = *p_unicode;
1639
1640 assert(unicode != NULL);
1641 assert(PyUnicode_Check(unicode));
1642 assert(0 <= length);
1643
Victor Stinner910337b2011-10-03 03:20:16 +02001644 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001645 old_length = PyUnicode_WSTR_LENGTH(unicode);
1646 else
1647 old_length = PyUnicode_GET_LENGTH(unicode);
1648 if (old_length == length)
1649 return 0;
1650
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001651 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001652 _Py_INCREF_UNICODE_EMPTY();
1653 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001654 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 Py_DECREF(*p_unicode);
1656 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001657 return 0;
1658 }
1659
Victor Stinner488fa492011-12-12 00:01:39 +01001660 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001661 PyObject *copy = resize_copy(unicode, length);
1662 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001663 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001664 Py_DECREF(*p_unicode);
1665 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001666 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001667 }
1668
Victor Stinnerfe226c02011-10-03 03:52:20 +02001669 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001670 PyObject *new_unicode = resize_compact(unicode, length);
1671 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001672 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001673 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001674 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001675 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001676 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001677}
1678
Alexander Belopolsky40018472011-02-26 01:02:56 +00001679int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001681{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001682 PyObject *unicode;
1683 if (p_unicode == NULL) {
1684 PyErr_BadInternalCall();
1685 return -1;
1686 }
1687 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001688 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001689 {
1690 PyErr_BadInternalCall();
1691 return -1;
1692 }
1693 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001694}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001695
Victor Stinnerc5166102012-02-22 13:55:02 +01001696/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001697
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001698 WARNING: The function doesn't copy the terminating null character and
1699 doesn't check the maximum character (may write a latin1 character in an
1700 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001701static void
1702unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1703 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001704{
1705 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1706 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001707 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001708
1709 switch (kind) {
1710 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001711 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001712#ifdef Py_DEBUG
1713 if (PyUnicode_IS_ASCII(unicode)) {
1714 Py_UCS4 maxchar = ucs1lib_find_max_char(
1715 (const Py_UCS1*)str,
1716 (const Py_UCS1*)str + len);
1717 assert(maxchar < 128);
1718 }
1719#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001720 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001721 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001722 }
1723 case PyUnicode_2BYTE_KIND: {
1724 Py_UCS2 *start = (Py_UCS2 *)data + index;
1725 Py_UCS2 *ucs2 = start;
1726 assert(index <= PyUnicode_GET_LENGTH(unicode));
1727
Victor Stinner184252a2012-06-16 02:57:41 +02001728 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001729 *ucs2 = (Py_UCS2)*str;
1730
1731 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001732 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 }
1734 default: {
1735 Py_UCS4 *start = (Py_UCS4 *)data + index;
1736 Py_UCS4 *ucs4 = start;
1737 assert(kind == PyUnicode_4BYTE_KIND);
1738 assert(index <= PyUnicode_GET_LENGTH(unicode));
1739
Victor Stinner184252a2012-06-16 02:57:41 +02001740 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001741 *ucs4 = (Py_UCS4)*str;
1742
1743 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001744 }
1745 }
1746}
1747
1748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749static PyObject*
1750get_latin1_char(unsigned char ch)
1751{
Victor Stinnera464fc12011-10-02 20:39:30 +02001752 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001754 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 if (!unicode)
1756 return NULL;
1757 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001758 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 unicode_latin1[ch] = unicode;
1760 }
1761 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001762 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763}
1764
Alexander Belopolsky40018472011-02-26 01:02:56 +00001765PyObject *
1766PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001768 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 Py_UCS4 maxchar = 0;
1770 Py_ssize_t num_surrogates;
1771
1772 if (u == NULL)
1773 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001775 /* If the Unicode data is known at construction time, we can apply
1776 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001779 if (size == 0)
1780 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 /* Single character Unicode objects in the Latin-1 range are
1783 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001784 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 return get_latin1_char((unsigned char)*u);
1786
1787 /* If not empty and not single character, copy the Unicode data
1788 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001789 if (find_maxchar_surrogates(u, u + size,
1790 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 return NULL;
1792
Victor Stinner8faf8212011-12-08 22:14:11 +01001793 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 if (!unicode)
1795 return NULL;
1796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 switch (PyUnicode_KIND(unicode)) {
1798 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001799 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1801 break;
1802 case PyUnicode_2BYTE_KIND:
1803#if Py_UNICODE_SIZE == 2
1804 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1805#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001806 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1808#endif
1809 break;
1810 case PyUnicode_4BYTE_KIND:
1811#if SIZEOF_WCHAR_T == 2
1812 /* This is the only case which has to process surrogates, thus
1813 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001814 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815#else
1816 assert(num_surrogates == 0);
1817 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1818#endif
1819 break;
1820 default:
1821 assert(0 && "Impossible state");
1822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001824 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825}
1826
Alexander Belopolsky40018472011-02-26 01:02:56 +00001827PyObject *
1828PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001829{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001830 if (size < 0) {
1831 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001832 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001833 return NULL;
1834 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001835 if (u != NULL)
1836 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1837 else
1838 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001839}
1840
Alexander Belopolsky40018472011-02-26 01:02:56 +00001841PyObject *
1842PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001843{
1844 size_t size = strlen(u);
1845 if (size > PY_SSIZE_T_MAX) {
1846 PyErr_SetString(PyExc_OverflowError, "input too long");
1847 return NULL;
1848 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001849 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001850}
1851
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001852PyObject *
1853_PyUnicode_FromId(_Py_Identifier *id)
1854{
1855 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001856 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1857 strlen(id->string),
1858 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001859 if (!id->object)
1860 return NULL;
1861 PyUnicode_InternInPlace(&id->object);
1862 assert(!id->next);
1863 id->next = static_strings;
1864 static_strings = id;
1865 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001866 return id->object;
1867}
1868
1869void
1870_PyUnicode_ClearStaticStrings()
1871{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001872 _Py_Identifier *tmp, *s = static_strings;
1873 while (s) {
1874 Py_DECREF(s->object);
1875 s->object = NULL;
1876 tmp = s->next;
1877 s->next = NULL;
1878 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001879 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001880 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001881}
1882
Benjamin Peterson0df54292012-03-26 14:50:32 -04001883/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001884
Victor Stinnerd3f08822012-05-29 12:57:52 +02001885PyObject*
1886_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001887{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001888 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001889 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001890 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001891#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001892 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001893#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001894 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001895 }
Victor Stinner785938e2011-12-11 20:09:03 +01001896 unicode = PyUnicode_New(size, 127);
1897 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001898 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001899 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1900 assert(_PyUnicode_CheckConsistency(unicode, 1));
1901 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001902}
1903
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001904static Py_UCS4
1905kind_maxchar_limit(unsigned int kind)
1906{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001907 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001908 case PyUnicode_1BYTE_KIND:
1909 return 0x80;
1910 case PyUnicode_2BYTE_KIND:
1911 return 0x100;
1912 case PyUnicode_4BYTE_KIND:
1913 return 0x10000;
1914 default:
1915 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001916 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001917 }
1918}
1919
Victor Stinnere6abb482012-05-02 01:15:40 +02001920Py_LOCAL_INLINE(Py_UCS4)
1921align_maxchar(Py_UCS4 maxchar)
1922{
1923 if (maxchar <= 127)
1924 return 127;
1925 else if (maxchar <= 255)
1926 return 255;
1927 else if (maxchar <= 65535)
1928 return 65535;
1929 else
1930 return MAX_UNICODE;
1931}
1932
Victor Stinner702c7342011-10-05 13:50:52 +02001933static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001934_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001935{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001937 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938
Serhiy Storchaka678db842013-01-26 12:16:36 +02001939 if (size == 0)
1940 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001941 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001942 if (size == 1)
1943 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001944
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001945 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001946 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 if (!res)
1948 return NULL;
1949 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001950 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001951 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001952}
1953
Victor Stinnere57b1c02011-09-28 22:20:48 +02001954static PyObject*
1955_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956{
1957 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001958 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001959
Serhiy Storchaka678db842013-01-26 12:16:36 +02001960 if (size == 0)
1961 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001962 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001963 if (size == 1) {
1964 Py_UCS4 ch = u[0];
1965 if (ch < 256)
1966 return get_latin1_char((unsigned char)ch);
1967
1968 res = PyUnicode_New(1, ch);
1969 if (res == NULL)
1970 return NULL;
1971 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1972 assert(_PyUnicode_CheckConsistency(res, 1));
1973 return res;
1974 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001977 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (!res)
1979 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001982 else {
1983 _PyUnicode_CONVERT_BYTES(
1984 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1985 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001986 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 return res;
1988}
1989
Victor Stinnere57b1c02011-09-28 22:20:48 +02001990static PyObject*
1991_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992{
1993 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001994 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001995
Serhiy Storchaka678db842013-01-26 12:16:36 +02001996 if (size == 0)
1997 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001998 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001999 if (size == 1) {
2000 Py_UCS4 ch = u[0];
2001 if (ch < 256)
2002 return get_latin1_char((unsigned char)ch);
2003
2004 res = PyUnicode_New(1, ch);
2005 if (res == NULL)
2006 return NULL;
2007 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
2008 assert(_PyUnicode_CheckConsistency(res, 1));
2009 return res;
2010 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002011
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002012 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002013 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 if (!res)
2015 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002016 if (max_char < 256)
2017 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2018 PyUnicode_1BYTE_DATA(res));
2019 else if (max_char < 0x10000)
2020 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2021 PyUnicode_2BYTE_DATA(res));
2022 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002024 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 return res;
2026}
2027
2028PyObject*
2029PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2030{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002031 if (size < 0) {
2032 PyErr_SetString(PyExc_ValueError, "size must be positive");
2033 return NULL;
2034 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002035 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002037 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002039 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002041 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002042 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002043 PyErr_SetString(PyExc_SystemError, "invalid kind");
2044 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046}
2047
Victor Stinnerece58de2012-04-23 23:36:38 +02002048Py_UCS4
2049_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2050{
2051 enum PyUnicode_Kind kind;
2052 void *startptr, *endptr;
2053
2054 assert(PyUnicode_IS_READY(unicode));
2055 assert(0 <= start);
2056 assert(end <= PyUnicode_GET_LENGTH(unicode));
2057 assert(start <= end);
2058
2059 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2060 return PyUnicode_MAX_CHAR_VALUE(unicode);
2061
2062 if (start == end)
2063 return 127;
2064
Victor Stinner94d558b2012-04-27 22:26:58 +02002065 if (PyUnicode_IS_ASCII(unicode))
2066 return 127;
2067
Victor Stinnerece58de2012-04-23 23:36:38 +02002068 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002069 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002070 endptr = (char *)startptr + end * kind;
2071 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002072 switch(kind) {
2073 case PyUnicode_1BYTE_KIND:
2074 return ucs1lib_find_max_char(startptr, endptr);
2075 case PyUnicode_2BYTE_KIND:
2076 return ucs2lib_find_max_char(startptr, endptr);
2077 case PyUnicode_4BYTE_KIND:
2078 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002079 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002080 assert(0);
2081 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002082 }
2083}
2084
Victor Stinner25a4b292011-10-06 12:31:55 +02002085/* Ensure that a string uses the most efficient storage, if it is not the
2086 case: create a new string with of the right kind. Write NULL into *p_unicode
2087 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002088static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002089unicode_adjust_maxchar(PyObject **p_unicode)
2090{
2091 PyObject *unicode, *copy;
2092 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002093 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002094 unsigned int kind;
2095
2096 assert(p_unicode != NULL);
2097 unicode = *p_unicode;
2098 assert(PyUnicode_IS_READY(unicode));
2099 if (PyUnicode_IS_ASCII(unicode))
2100 return;
2101
2102 len = PyUnicode_GET_LENGTH(unicode);
2103 kind = PyUnicode_KIND(unicode);
2104 if (kind == PyUnicode_1BYTE_KIND) {
2105 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002106 max_char = ucs1lib_find_max_char(u, u + len);
2107 if (max_char >= 128)
2108 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002109 }
2110 else if (kind == PyUnicode_2BYTE_KIND) {
2111 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002112 max_char = ucs2lib_find_max_char(u, u + len);
2113 if (max_char >= 256)
2114 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002115 }
2116 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002117 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002118 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002119 max_char = ucs4lib_find_max_char(u, u + len);
2120 if (max_char >= 0x10000)
2121 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002123 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002124 if (copy != NULL)
2125 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002126 Py_DECREF(unicode);
2127 *p_unicode = copy;
2128}
2129
Victor Stinner034f6cf2011-09-30 02:26:44 +02002130PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002131_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002132{
Victor Stinner87af4f22011-11-21 23:03:47 +01002133 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002134 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002135
Victor Stinner034f6cf2011-09-30 02:26:44 +02002136 if (!PyUnicode_Check(unicode)) {
2137 PyErr_BadInternalCall();
2138 return NULL;
2139 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002140 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002141 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002142
Victor Stinner87af4f22011-11-21 23:03:47 +01002143 length = PyUnicode_GET_LENGTH(unicode);
2144 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145 if (!copy)
2146 return NULL;
2147 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2148
Victor Stinner87af4f22011-11-21 23:03:47 +01002149 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2150 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002151 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153}
2154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155
Victor Stinnerbc603d12011-10-02 01:00:40 +02002156/* Widen Unicode objects to larger buffers. Don't write terminating null
2157 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158
2159void*
2160_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2161{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002162 Py_ssize_t len;
2163 void *result;
2164 unsigned int skind;
2165
Benjamin Petersonbac79492012-01-14 13:34:47 -05002166 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167 return NULL;
2168
2169 len = PyUnicode_GET_LENGTH(s);
2170 skind = PyUnicode_KIND(s);
2171 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002172 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return NULL;
2174 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002175 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002176 case PyUnicode_2BYTE_KIND:
2177 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2178 if (!result)
2179 return PyErr_NoMemory();
2180 assert(skind == PyUnicode_1BYTE_KIND);
2181 _PyUnicode_CONVERT_BYTES(
2182 Py_UCS1, Py_UCS2,
2183 PyUnicode_1BYTE_DATA(s),
2184 PyUnicode_1BYTE_DATA(s) + len,
2185 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002187 case PyUnicode_4BYTE_KIND:
2188 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2189 if (!result)
2190 return PyErr_NoMemory();
2191 if (skind == PyUnicode_2BYTE_KIND) {
2192 _PyUnicode_CONVERT_BYTES(
2193 Py_UCS2, Py_UCS4,
2194 PyUnicode_2BYTE_DATA(s),
2195 PyUnicode_2BYTE_DATA(s) + len,
2196 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002198 else {
2199 assert(skind == PyUnicode_1BYTE_KIND);
2200 _PyUnicode_CONVERT_BYTES(
2201 Py_UCS1, Py_UCS4,
2202 PyUnicode_1BYTE_DATA(s),
2203 PyUnicode_1BYTE_DATA(s) + len,
2204 result);
2205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002207 default:
2208 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 }
Victor Stinner01698042011-10-04 00:04:26 +02002210 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 return NULL;
2212}
2213
2214static Py_UCS4*
2215as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2216 int copy_null)
2217{
2218 int kind;
2219 void *data;
2220 Py_ssize_t len, targetlen;
2221 if (PyUnicode_READY(string) == -1)
2222 return NULL;
2223 kind = PyUnicode_KIND(string);
2224 data = PyUnicode_DATA(string);
2225 len = PyUnicode_GET_LENGTH(string);
2226 targetlen = len;
2227 if (copy_null)
2228 targetlen++;
2229 if (!target) {
2230 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2231 PyErr_NoMemory();
2232 return NULL;
2233 }
2234 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2235 if (!target) {
2236 PyErr_NoMemory();
2237 return NULL;
2238 }
2239 }
2240 else {
2241 if (targetsize < targetlen) {
2242 PyErr_Format(PyExc_SystemError,
2243 "string is longer than the buffer");
2244 if (copy_null && 0 < targetsize)
2245 target[0] = 0;
2246 return NULL;
2247 }
2248 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002249 if (kind == PyUnicode_1BYTE_KIND) {
2250 Py_UCS1 *start = (Py_UCS1 *) data;
2251 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002253 else if (kind == PyUnicode_2BYTE_KIND) {
2254 Py_UCS2 *start = (Py_UCS2 *) data;
2255 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2256 }
2257 else {
2258 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 if (copy_null)
2262 target[len] = 0;
2263 return target;
2264}
2265
2266Py_UCS4*
2267PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2268 int copy_null)
2269{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002270 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 PyErr_BadInternalCall();
2272 return NULL;
2273 }
2274 return as_ucs4(string, target, targetsize, copy_null);
2275}
2276
2277Py_UCS4*
2278PyUnicode_AsUCS4Copy(PyObject *string)
2279{
2280 return as_ucs4(string, NULL, 0, 1);
2281}
2282
2283#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002284
Alexander Belopolsky40018472011-02-26 01:02:56 +00002285PyObject *
2286PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002290 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002291 PyErr_BadInternalCall();
2292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 }
2294
Martin v. Löwis790465f2008-04-05 20:41:37 +00002295 if (size == -1) {
2296 size = wcslen(w);
2297 }
2298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002299 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300}
2301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002303
Walter Dörwald346737f2007-05-31 10:44:43 +00002304static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002305makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002306 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002307{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002308 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002309 if (longflag)
2310 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002311 else if (longlongflag) {
2312 /* longlongflag should only ever be nonzero on machines with
2313 HAVE_LONG_LONG defined */
2314#ifdef HAVE_LONG_LONG
2315 char *f = PY_FORMAT_LONG_LONG;
2316 while (*f)
2317 *fmt++ = *f++;
2318#else
2319 /* we shouldn't ever get here */
2320 assert(0);
2321 *fmt++ = 'l';
2322#endif
2323 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002324 else if (size_tflag) {
2325 char *f = PY_FORMAT_SIZE_T;
2326 while (*f)
2327 *fmt++ = *f++;
2328 }
2329 *fmt++ = c;
2330 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002331}
2332
Victor Stinner15a11362012-10-06 23:48:20 +02002333/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002334 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2335 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2336#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002337
2338static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002339unicode_fromformat_arg(_PyUnicodeWriter *writer,
2340 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002341{
Victor Stinnere215d962012-10-06 23:03:36 +02002342 const char *p;
2343 Py_ssize_t len;
2344 int zeropad;
2345 int width;
2346 int precision;
2347 int longflag;
2348 int longlongflag;
2349 int size_tflag;
2350 int fill;
2351
2352 p = f;
2353 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002354 zeropad = 0;
2355 if (*f == '0') {
2356 zeropad = 1;
2357 f++;
2358 }
Victor Stinner96865452011-03-01 23:44:09 +00002359
2360 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002361 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002362 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002363 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2364 PyErr_SetString(PyExc_ValueError,
2365 "width too big");
2366 return NULL;
2367 }
Victor Stinnere215d962012-10-06 23:03:36 +02002368 width = (width*10) + (*f - '0');
2369 f++;
2370 }
Victor Stinner96865452011-03-01 23:44:09 +00002371 precision = 0;
2372 if (*f == '.') {
2373 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002374 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002375 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2376 PyErr_SetString(PyExc_ValueError,
2377 "precision too big");
2378 return NULL;
2379 }
Victor Stinnere215d962012-10-06 23:03:36 +02002380 precision = (precision*10) + (*f - '0');
2381 f++;
2382 }
Victor Stinner96865452011-03-01 23:44:09 +00002383 if (*f == '%') {
2384 /* "%.3%s" => f points to "3" */
2385 f--;
2386 }
2387 }
2388 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002389 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002390 f--;
2391 }
Victor Stinner96865452011-03-01 23:44:09 +00002392
2393 /* Handle %ld, %lu, %lld and %llu. */
2394 longflag = 0;
2395 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002396 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002397 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002398 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002399 longflag = 1;
2400 ++f;
2401 }
2402#ifdef HAVE_LONG_LONG
2403 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002404 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002405 longlongflag = 1;
2406 f += 2;
2407 }
2408#endif
2409 }
2410 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002411 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002412 size_tflag = 1;
2413 ++f;
2414 }
Victor Stinnere215d962012-10-06 23:03:36 +02002415
2416 if (f[1] == '\0')
2417 writer->overallocate = 0;
2418
2419 switch (*f) {
2420 case 'c':
2421 {
2422 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002423 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2424 PyErr_SetString(PyExc_ValueError,
2425 "character argument not in range(0x110000)");
2426 return NULL;
2427 }
Victor Stinnere215d962012-10-06 23:03:36 +02002428 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2429 return NULL;
2430 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2431 writer->pos++;
2432 break;
2433 }
2434
2435 case 'i':
2436 case 'd':
2437 case 'u':
2438 case 'x':
2439 {
2440 /* used by sprintf */
2441 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002442 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002443
2444 if (*f == 'u') {
2445 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2446
2447 if (longflag)
2448 len = sprintf(buffer, fmt,
2449 va_arg(*vargs, unsigned long));
2450#ifdef HAVE_LONG_LONG
2451 else if (longlongflag)
2452 len = sprintf(buffer, fmt,
2453 va_arg(*vargs, unsigned PY_LONG_LONG));
2454#endif
2455 else if (size_tflag)
2456 len = sprintf(buffer, fmt,
2457 va_arg(*vargs, size_t));
2458 else
2459 len = sprintf(buffer, fmt,
2460 va_arg(*vargs, unsigned int));
2461 }
2462 else if (*f == 'x') {
2463 makefmt(fmt, 0, 0, 0, 'x');
2464 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2465 }
2466 else {
2467 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2468
2469 if (longflag)
2470 len = sprintf(buffer, fmt,
2471 va_arg(*vargs, long));
2472#ifdef HAVE_LONG_LONG
2473 else if (longlongflag)
2474 len = sprintf(buffer, fmt,
2475 va_arg(*vargs, PY_LONG_LONG));
2476#endif
2477 else if (size_tflag)
2478 len = sprintf(buffer, fmt,
2479 va_arg(*vargs, Py_ssize_t));
2480 else
2481 len = sprintf(buffer, fmt,
2482 va_arg(*vargs, int));
2483 }
2484 assert(len >= 0);
2485
Victor Stinnere215d962012-10-06 23:03:36 +02002486 if (precision < len)
2487 precision = len;
2488 if (width > precision) {
2489 Py_UCS4 fillchar;
2490 fill = width - precision;
2491 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002492 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2493 return NULL;
2494 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2495 return NULL;
2496 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002497 }
Victor Stinner15a11362012-10-06 23:48:20 +02002498 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002499 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002500 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2501 return NULL;
2502 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2503 return NULL;
2504 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002505 }
Victor Stinner15a11362012-10-06 23:48:20 +02002506 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002507 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002508 break;
2509 }
2510
2511 case 'p':
2512 {
2513 char number[MAX_LONG_LONG_CHARS];
2514
2515 len = sprintf(number, "%p", va_arg(*vargs, void*));
2516 assert(len >= 0);
2517
2518 /* %p is ill-defined: ensure leading 0x. */
2519 if (number[1] == 'X')
2520 number[1] = 'x';
2521 else if (number[1] != 'x') {
2522 memmove(number + 2, number,
2523 strlen(number) + 1);
2524 number[0] = '0';
2525 number[1] = 'x';
2526 len += 2;
2527 }
2528
2529 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2530 return NULL;
2531 break;
2532 }
2533
2534 case 's':
2535 {
2536 /* UTF-8 */
2537 const char *s = va_arg(*vargs, const char*);
2538 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2539 if (!str)
2540 return NULL;
2541 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2542 Py_DECREF(str);
2543 return NULL;
2544 }
2545 Py_DECREF(str);
2546 break;
2547 }
2548
2549 case 'U':
2550 {
2551 PyObject *obj = va_arg(*vargs, PyObject *);
2552 assert(obj && _PyUnicode_CHECK(obj));
2553
2554 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2555 return NULL;
2556 break;
2557 }
2558
2559 case 'V':
2560 {
2561 PyObject *obj = va_arg(*vargs, PyObject *);
2562 const char *str = va_arg(*vargs, const char *);
2563 PyObject *str_obj;
2564 assert(obj || str);
2565 if (obj) {
2566 assert(_PyUnicode_CHECK(obj));
2567 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2568 return NULL;
2569 }
2570 else {
2571 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2572 if (!str_obj)
2573 return NULL;
2574 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2575 Py_DECREF(str_obj);
2576 return NULL;
2577 }
2578 Py_DECREF(str_obj);
2579 }
2580 break;
2581 }
2582
2583 case 'S':
2584 {
2585 PyObject *obj = va_arg(*vargs, PyObject *);
2586 PyObject *str;
2587 assert(obj);
2588 str = PyObject_Str(obj);
2589 if (!str)
2590 return NULL;
2591 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2592 Py_DECREF(str);
2593 return NULL;
2594 }
2595 Py_DECREF(str);
2596 break;
2597 }
2598
2599 case 'R':
2600 {
2601 PyObject *obj = va_arg(*vargs, PyObject *);
2602 PyObject *repr;
2603 assert(obj);
2604 repr = PyObject_Repr(obj);
2605 if (!repr)
2606 return NULL;
2607 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2608 Py_DECREF(repr);
2609 return NULL;
2610 }
2611 Py_DECREF(repr);
2612 break;
2613 }
2614
2615 case 'A':
2616 {
2617 PyObject *obj = va_arg(*vargs, PyObject *);
2618 PyObject *ascii;
2619 assert(obj);
2620 ascii = PyObject_ASCII(obj);
2621 if (!ascii)
2622 return NULL;
2623 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2624 Py_DECREF(ascii);
2625 return NULL;
2626 }
2627 Py_DECREF(ascii);
2628 break;
2629 }
2630
2631 case '%':
2632 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2633 return NULL;
2634 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2635 writer->pos++;
2636 break;
2637
2638 default:
2639 /* if we stumble upon an unknown formatting code, copy the rest
2640 of the format string to the output string. (we cannot just
2641 skip the code, since there's no way to know what's in the
2642 argument list) */
2643 len = strlen(p);
2644 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2645 return NULL;
2646 f = p+len;
2647 return f;
2648 }
2649
2650 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002651 return f;
2652}
2653
Walter Dörwaldd2034312007-05-18 16:29:38 +00002654PyObject *
2655PyUnicode_FromFormatV(const char *format, va_list vargs)
2656{
Victor Stinnere215d962012-10-06 23:03:36 +02002657 va_list vargs2;
2658 const char *f;
2659 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002660
Victor Stinnere215d962012-10-06 23:03:36 +02002661 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2662
2663 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2664 Copy it to be able to pass a reference to a subfunction. */
2665 Py_VA_COPY(vargs2, vargs);
2666
2667 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002669 f = unicode_fromformat_arg(&writer, f, &vargs2);
2670 if (f == NULL)
2671 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002674 const char *p;
2675 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002676
Victor Stinnere215d962012-10-06 23:03:36 +02002677 p = f;
2678 do
2679 {
2680 if ((unsigned char)*p > 127) {
2681 PyErr_Format(PyExc_ValueError,
2682 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2683 "string, got a non-ASCII byte: 0x%02x",
2684 (unsigned char)*p);
2685 return NULL;
2686 }
2687 p++;
2688 }
2689 while (*p != '\0' && *p != '%');
2690 len = p - f;
2691
2692 if (*p == '\0')
2693 writer.overallocate = 0;
2694 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2695 goto fail;
2696 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2697 writer.pos += len;
2698
2699 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002701 }
Victor Stinnere215d962012-10-06 23:03:36 +02002702 return _PyUnicodeWriter_Finish(&writer);
2703
2704 fail:
2705 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707}
2708
Walter Dörwaldd2034312007-05-18 16:29:38 +00002709PyObject *
2710PyUnicode_FromFormat(const char *format, ...)
2711{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 PyObject* ret;
2713 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002714
2715#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002720 ret = PyUnicode_FromFormatV(format, vargs);
2721 va_end(vargs);
2722 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002723}
2724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725#ifdef HAVE_WCHAR_H
2726
Victor Stinner5593d8a2010-10-02 11:11:27 +00002727/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2728 convert a Unicode object to a wide character string.
2729
Victor Stinnerd88d9832011-09-06 02:00:05 +02002730 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 character) required to convert the unicode object. Ignore size argument.
2732
Victor Stinnerd88d9832011-09-06 02:00:05 +02002733 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002735 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002737unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002738 wchar_t *w,
2739 Py_ssize_t size)
2740{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002741 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 const wchar_t *wstr;
2743
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002744 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 if (wstr == NULL)
2746 return -1;
2747
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002749 if (size > res)
2750 size = res + 1;
2751 else
2752 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002754 return res;
2755 }
2756 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002757 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002758}
2759
2760Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002761PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002762 wchar_t *w,
2763 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764{
2765 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 PyErr_BadInternalCall();
2767 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002769 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770}
2771
Victor Stinner137c34c2010-09-29 10:25:54 +00002772wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002773PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002774 Py_ssize_t *size)
2775{
2776 wchar_t* buffer;
2777 Py_ssize_t buflen;
2778
2779 if (unicode == NULL) {
2780 PyErr_BadInternalCall();
2781 return NULL;
2782 }
2783
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002784 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 if (buflen == -1)
2786 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002787 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002788 PyErr_NoMemory();
2789 return NULL;
2790 }
2791
Victor Stinner137c34c2010-09-29 10:25:54 +00002792 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2793 if (buffer == NULL) {
2794 PyErr_NoMemory();
2795 return NULL;
2796 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002797 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002798 if (buflen == -1) {
2799 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002801 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802 if (size != NULL)
2803 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002804 return buffer;
2805}
2806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808
Alexander Belopolsky40018472011-02-26 01:02:56 +00002809PyObject *
2810PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002813 void *data;
2814 int kind;
2815
Victor Stinner8faf8212011-12-08 22:14:11 +01002816 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002817 PyErr_SetString(PyExc_ValueError,
2818 "chr() arg not in range(0x110000)");
2819 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002820 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002821
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002822 if ((Py_UCS4)ordinal < 256)
2823 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 v = PyUnicode_New(1, ordinal);
2826 if (v == NULL)
2827 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002828 kind = PyUnicode_KIND(v);
2829 data = PyUnicode_DATA(v);
2830 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002831 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002833}
2834
Alexander Belopolsky40018472011-02-26 01:02:56 +00002835PyObject *
2836PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002838 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002840 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002841 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002842 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002843 Py_INCREF(obj);
2844 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002845 }
2846 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 /* For a Unicode subtype that's not a Unicode object,
2848 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002849 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002850 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002851 PyErr_Format(PyExc_TypeError,
2852 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002853 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002854 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002855}
2856
Alexander Belopolsky40018472011-02-26 01:02:56 +00002857PyObject *
2858PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002859 const char *encoding,
2860 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002861{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002863 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002864
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002866 PyErr_BadInternalCall();
2867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002869
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002870 /* Decoding bytes objects is the most common case and should be fast */
2871 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002872 if (PyBytes_GET_SIZE(obj) == 0)
2873 _Py_RETURN_UNICODE_EMPTY();
2874 v = PyUnicode_Decode(
2875 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2876 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 return v;
2878 }
2879
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002880 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 PyErr_SetString(PyExc_TypeError,
2882 "decoding str is not supported");
2883 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002884 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2887 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2888 PyErr_Format(PyExc_TypeError,
2889 "coercing to str: need bytes, bytearray "
2890 "or buffer-like object, %.80s found",
2891 Py_TYPE(obj)->tp_name);
2892 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002893 }
Tim Petersced69f82003-09-16 20:30:58 +00002894
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002896 PyBuffer_Release(&buffer);
2897 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002899
Serhiy Storchaka05997252013-01-26 12:14:02 +02002900 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002901 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002902 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903}
2904
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   NUL-terminated on success.

   Return 1 on success, 0 on error (the normalized name plus its trailing
   NUL does not fit in lower_len bytes, which includes lower_len == 0).
   On error the content of lower is unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the NUL terminator nothing can be stored; this
       also keeps "lower_len - 1" below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the trailing NUL, so this is the exact number of
           bytes strcpy() is going to write. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last slot of the buffer: reserved for the NUL terminator. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2941
Alexander Belopolsky40018472011-02-26 01:02:56 +00002942PyObject *
2943PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002944 Py_ssize_t size,
2945 const char *encoding,
2946 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002947{
2948 PyObject *buffer = NULL, *unicode;
2949 Py_buffer info;
2950 char lower[11]; /* Enough for any encoding shortcut */
2951
Fred Drakee4315f52000-05-09 19:53:39 +00002952 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002953 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002954 if ((strcmp(lower, "utf-8") == 0) ||
2955 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002956 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002957 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002958 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002959 (strcmp(lower, "iso-8859-1") == 0))
2960 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002961#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002962 else if (strcmp(lower, "mbcs") == 0)
2963 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002964#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002965 else if (strcmp(lower, "ascii") == 0)
2966 return PyUnicode_DecodeASCII(s, size, errors);
2967 else if (strcmp(lower, "utf-16") == 0)
2968 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2969 else if (strcmp(lower, "utf-32") == 0)
2970 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972
2973 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002974 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002975 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002976 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002977 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 if (buffer == NULL)
2979 goto onError;
2980 unicode = PyCodec_Decode(buffer, encoding, errors);
2981 if (unicode == NULL)
2982 goto onError;
2983 if (!PyUnicode_Check(unicode)) {
2984 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002985 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002986 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 Py_DECREF(unicode);
2988 goto onError;
2989 }
2990 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002991 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002992
Benjamin Peterson29060642009-01-31 22:14:21 +00002993 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 Py_XDECREF(buffer);
2995 return NULL;
2996}
2997
Alexander Belopolsky40018472011-02-26 01:02:56 +00002998PyObject *
2999PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003000 const char *encoding,
3001 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003002{
3003 PyObject *v;
3004
3005 if (!PyUnicode_Check(unicode)) {
3006 PyErr_BadArgument();
3007 goto onError;
3008 }
3009
3010 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012
3013 /* Decode via the codec registry */
3014 v = PyCodec_Decode(unicode, encoding, errors);
3015 if (v == NULL)
3016 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003017 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003018
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003020 return NULL;
3021}
3022
Alexander Belopolsky40018472011-02-26 01:02:56 +00003023PyObject *
3024PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003025 const char *encoding,
3026 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003027{
3028 PyObject *v;
3029
3030 if (!PyUnicode_Check(unicode)) {
3031 PyErr_BadArgument();
3032 goto onError;
3033 }
3034
3035 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037
3038 /* Decode via the codec registry */
3039 v = PyCodec_Decode(unicode, encoding, errors);
3040 if (v == NULL)
3041 goto onError;
3042 if (!PyUnicode_Check(v)) {
3043 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003044 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003045 Py_TYPE(v)->tp_name);
3046 Py_DECREF(v);
3047 goto onError;
3048 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003049 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003050
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003052 return NULL;
3053}
3054
Alexander Belopolsky40018472011-02-26 01:02:56 +00003055PyObject *
3056PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003057 Py_ssize_t size,
3058 const char *encoding,
3059 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060{
3061 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003062
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 unicode = PyUnicode_FromUnicode(s, size);
3064 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3067 Py_DECREF(unicode);
3068 return v;
3069}
3070
Alexander Belopolsky40018472011-02-26 01:02:56 +00003071PyObject *
3072PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003073 const char *encoding,
3074 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003075{
3076 PyObject *v;
3077
3078 if (!PyUnicode_Check(unicode)) {
3079 PyErr_BadArgument();
3080 goto onError;
3081 }
3082
3083 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003085
3086 /* Encode via the codec registry */
3087 v = PyCodec_Encode(unicode, encoding, errors);
3088 if (v == NULL)
3089 goto onError;
3090 return v;
3091
Benjamin Peterson29060642009-01-31 22:14:21 +00003092 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003093 return NULL;
3094}
3095
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Return the index of that character in wstr, or 0 if every character
   converts -- so 0 is also what is returned when nothing was found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];           /* surrogate pair + terminator */
#else
    wchar_t chunk[2];           /* single character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator. */
#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *chunk_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of the pair together. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return chunk_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3142
Victor Stinner1b579672011-12-17 05:47:23 +01003143static int
3144locale_error_handler(const char *errors, int *surrogateescape)
3145{
3146 if (errors == NULL) {
3147 *surrogateescape = 0;
3148 return 0;
3149 }
3150
3151 if (strcmp(errors, "strict") == 0) {
3152 *surrogateescape = 0;
3153 return 0;
3154 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003155 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003156 *surrogateescape = 1;
3157 return 0;
3158 }
3159 PyErr_Format(PyExc_ValueError,
3160 "only 'strict' and 'surrogateescape' error handlers "
3161 "are supported, not '%s'",
3162 errors);
3163 return -1;
3164}
3165
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003166PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003167PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003168{
3169 Py_ssize_t wlen, wlen2;
3170 wchar_t *wstr;
3171 PyObject *bytes = NULL;
3172 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003173 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003174 PyObject *exc;
3175 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003176 int surrogateescape;
3177
3178 if (locale_error_handler(errors, &surrogateescape) < 0)
3179 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003180
3181 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3182 if (wstr == NULL)
3183 return NULL;
3184
3185 wlen2 = wcslen(wstr);
3186 if (wlen2 != wlen) {
3187 PyMem_Free(wstr);
3188 PyErr_SetString(PyExc_TypeError, "embedded null character");
3189 return NULL;
3190 }
3191
3192 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003193 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003194 char *str;
3195
3196 str = _Py_wchar2char(wstr, &error_pos);
3197 if (str == NULL) {
3198 if (error_pos == (size_t)-1) {
3199 PyErr_NoMemory();
3200 PyMem_Free(wstr);
3201 return NULL;
3202 }
3203 else {
3204 goto encode_error;
3205 }
3206 }
3207 PyMem_Free(wstr);
3208
3209 bytes = PyBytes_FromString(str);
3210 PyMem_Free(str);
3211 }
3212 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003213 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003214 size_t len, len2;
3215
3216 len = wcstombs(NULL, wstr, 0);
3217 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003218 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003219 goto encode_error;
3220 }
3221
3222 bytes = PyBytes_FromStringAndSize(NULL, len);
3223 if (bytes == NULL) {
3224 PyMem_Free(wstr);
3225 return NULL;
3226 }
3227
3228 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3229 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003230 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003231 goto encode_error;
3232 }
3233 PyMem_Free(wstr);
3234 }
3235 return bytes;
3236
3237encode_error:
3238 errmsg = strerror(errno);
3239 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003240
3241 if (error_pos == (size_t)-1)
3242 error_pos = wcstombs_errorpos(wstr);
3243
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003244 PyMem_Free(wstr);
3245 Py_XDECREF(bytes);
3246
Victor Stinner2f197072011-12-17 07:08:30 +01003247 if (errmsg != NULL) {
3248 size_t errlen;
3249 wstr = _Py_char2wchar(errmsg, &errlen);
3250 if (wstr != NULL) {
3251 reason = PyUnicode_FromWideChar(wstr, errlen);
3252 PyMem_Free(wstr);
3253 } else
3254 errmsg = NULL;
3255 }
3256 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003257 reason = PyUnicode_FromString(
3258 "wcstombs() encountered an unencodable "
3259 "wide character");
3260 if (reason == NULL)
3261 return NULL;
3262
3263 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3264 "locale", unicode,
3265 (Py_ssize_t)error_pos,
3266 (Py_ssize_t)(error_pos+1),
3267 reason);
3268 Py_DECREF(reason);
3269 if (exc != NULL) {
3270 PyCodec_StrictErrors(exc);
3271 Py_XDECREF(exc);
3272 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003273 return NULL;
3274}
3275
Victor Stinnerad158722010-10-27 00:25:46 +00003276PyObject *
3277PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003278{
Victor Stinner99b95382011-07-04 14:23:54 +02003279#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003280 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003281#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003282 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003283#else
Victor Stinner793b5312011-04-27 00:24:21 +02003284 PyInterpreterState *interp = PyThreadState_GET()->interp;
3285 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3286 cannot use it to encode and decode filenames before it is loaded. Load
3287 the Python codec requires to encode at least its own filename. Use the C
3288 version of the locale codec until the codec registry is initialized and
3289 the Python codec is loaded.
3290
3291 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3292 cannot only rely on it: check also interp->fscodec_initialized for
3293 subinterpreters. */
3294 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003295 return PyUnicode_AsEncodedString(unicode,
3296 Py_FileSystemDefaultEncoding,
3297 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003298 }
3299 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003300 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003301 }
Victor Stinnerad158722010-10-27 00:25:46 +00003302#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003303}
3304
Alexander Belopolsky40018472011-02-26 01:02:56 +00003305PyObject *
3306PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003307 const char *encoding,
3308 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309{
3310 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003311 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003312
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 if (!PyUnicode_Check(unicode)) {
3314 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 }
Fred Drakee4315f52000-05-09 19:53:39 +00003317
Fred Drakee4315f52000-05-09 19:53:39 +00003318 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003319 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003320 if ((strcmp(lower, "utf-8") == 0) ||
3321 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003322 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003323 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003324 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003325 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003326 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003327 }
Victor Stinner37296e82010-06-10 13:36:23 +00003328 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003329 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003330 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003331 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003332#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003333 else if (strcmp(lower, "mbcs") == 0)
3334 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003335#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003336 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003337 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339
3340 /* Encode via the codec registry */
3341 v = PyCodec_Encode(unicode, encoding, errors);
3342 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003343 return NULL;
3344
3345 /* The normal path */
3346 if (PyBytes_Check(v))
3347 return v;
3348
3349 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003350 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003351 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003352 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003353
3354 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3355 "encoder %s returned bytearray instead of bytes",
3356 encoding);
3357 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003358 Py_DECREF(v);
3359 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003360 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003361
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003362 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3363 Py_DECREF(v);
3364 return b;
3365 }
3366
3367 PyErr_Format(PyExc_TypeError,
3368 "encoder did not return a bytes object (type=%.400s)",
3369 Py_TYPE(v)->tp_name);
3370 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003371 return NULL;
3372}
3373
Alexander Belopolsky40018472011-02-26 01:02:56 +00003374PyObject *
3375PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003376 const char *encoding,
3377 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003378{
3379 PyObject *v;
3380
3381 if (!PyUnicode_Check(unicode)) {
3382 PyErr_BadArgument();
3383 goto onError;
3384 }
3385
3386 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003387 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003388
3389 /* Encode via the codec registry */
3390 v = PyCodec_Encode(unicode, encoding, errors);
3391 if (v == NULL)
3392 goto onError;
3393 if (!PyUnicode_Check(v)) {
3394 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003395 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003396 Py_TYPE(v)->tp_name);
3397 Py_DECREF(v);
3398 goto onError;
3399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003401
Benjamin Peterson29060642009-01-31 22:14:21 +00003402 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 return NULL;
3404}
3405
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot decode.

   Return the byte offset of that sequence, or 0 if none was found -- so 0 is
   also what is returned when nothing was found.  Without HAVE_MBRTOWC the
   position cannot be determined and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    size_t offset = 0;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (offset < len) {
        size_t used = mbrtowc(&wc, (char*)(str + offset), len - offset,
                              &state);

        if (used == 0)
            /* Reached end of string */
            break;
        if (used == (size_t)-1 || used == (size_t)-2)
            /* Conversion error or incomplete character */
            return offset;
        offset += used;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3436
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003437PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003438PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003439 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003440{
3441 wchar_t smallbuf[256];
3442 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3443 wchar_t *wstr;
3444 size_t wlen, wlen2;
3445 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003446 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003447 size_t error_pos;
3448 char *errmsg;
3449 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003450
3451 if (locale_error_handler(errors, &surrogateescape) < 0)
3452 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003453
3454 if (str[len] != '\0' || len != strlen(str)) {
3455 PyErr_SetString(PyExc_TypeError, "embedded null character");
3456 return NULL;
3457 }
3458
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003459 if (surrogateescape) {
3460 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003461 wstr = _Py_char2wchar(str, &wlen);
3462 if (wstr == NULL) {
3463 if (wlen == (size_t)-1)
3464 PyErr_NoMemory();
3465 else
3466 PyErr_SetFromErrno(PyExc_OSError);
3467 return NULL;
3468 }
3469
3470 unicode = PyUnicode_FromWideChar(wstr, wlen);
3471 PyMem_Free(wstr);
3472 }
3473 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003474 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003475#ifndef HAVE_BROKEN_MBSTOWCS
3476 wlen = mbstowcs(NULL, str, 0);
3477#else
3478 wlen = len;
3479#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003480 if (wlen == (size_t)-1)
3481 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003482 if (wlen+1 <= smallbuf_len) {
3483 wstr = smallbuf;
3484 }
3485 else {
3486 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3487 return PyErr_NoMemory();
3488
3489 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3490 if (!wstr)
3491 return PyErr_NoMemory();
3492 }
3493
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003494 wlen2 = mbstowcs(wstr, str, wlen+1);
3495 if (wlen2 == (size_t)-1) {
3496 if (wstr != smallbuf)
3497 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003498 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003499 }
3500#ifdef HAVE_BROKEN_MBSTOWCS
3501 assert(wlen2 == wlen);
3502#endif
3503 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3504 if (wstr != smallbuf)
3505 PyMem_Free(wstr);
3506 }
3507 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003508
3509decode_error:
3510 errmsg = strerror(errno);
3511 assert(errmsg != NULL);
3512
3513 error_pos = mbstowcs_errorpos(str, len);
3514 if (errmsg != NULL) {
3515 size_t errlen;
3516 wstr = _Py_char2wchar(errmsg, &errlen);
3517 if (wstr != NULL) {
3518 reason = PyUnicode_FromWideChar(wstr, errlen);
3519 PyMem_Free(wstr);
3520 } else
3521 errmsg = NULL;
3522 }
3523 if (errmsg == NULL)
3524 reason = PyUnicode_FromString(
3525 "mbstowcs() encountered an invalid multibyte sequence");
3526 if (reason == NULL)
3527 return NULL;
3528
3529 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3530 "locale", str, len,
3531 (Py_ssize_t)error_pos,
3532 (Py_ssize_t)(error_pos+1),
3533 reason);
3534 Py_DECREF(reason);
3535 if (exc != NULL) {
3536 PyCodec_StrictErrors(exc);
3537 Py_XDECREF(exc);
3538 }
3539 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003540}
3541
3542PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003543PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003544{
3545 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003546 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547}
3548
3549
3550PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003551PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003552 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003553 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3554}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003555
Christian Heimes5894ba72007-11-04 11:43:14 +00003556PyObject*
3557PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3558{
Victor Stinner99b95382011-07-04 14:23:54 +02003559#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003560 return PyUnicode_DecodeMBCS(s, size, NULL);
3561#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003562 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003563#else
Victor Stinner793b5312011-04-27 00:24:21 +02003564 PyInterpreterState *interp = PyThreadState_GET()->interp;
3565 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3566 cannot use it to encode and decode filenames before it is loaded. Load
3567 the Python codec requires to encode at least its own filename. Use the C
3568 version of the locale codec until the codec registry is initialized and
3569 the Python codec is loaded.
3570
3571 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3572 cannot only rely on it: check also interp->fscodec_initialized for
3573 subinterpreters. */
3574 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003575 return PyUnicode_Decode(s, size,
3576 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003577 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003578 }
3579 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003580 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003581 }
Victor Stinnerad158722010-10-27 00:25:46 +00003582#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003583}
3584
Martin v. Löwis011e8422009-05-05 04:43:17 +00003585
3586int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003587_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003588{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003589 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003590
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003591 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003592 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003593 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3594 PyUnicode_GET_LENGTH(str), '\0', 1);
3595 if (pos == -1)
3596 return 0;
3597 else
3598 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003599}
3600
Antoine Pitrou13348842012-01-29 18:36:34 +01003601int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003602PyUnicode_FSConverter(PyObject* arg, void* addr)
3603{
3604 PyObject *output = NULL;
3605 Py_ssize_t size;
3606 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003607 if (arg == NULL) {
3608 Py_DECREF(*(PyObject**)addr);
3609 return 1;
3610 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003611 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003612 output = arg;
3613 Py_INCREF(output);
3614 }
3615 else {
3616 arg = PyUnicode_FromObject(arg);
3617 if (!arg)
3618 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003619 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620 Py_DECREF(arg);
3621 if (!output)
3622 return 0;
3623 if (!PyBytes_Check(output)) {
3624 Py_DECREF(output);
3625 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3626 return 0;
3627 }
3628 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003629 size = PyBytes_GET_SIZE(output);
3630 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003631 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003632 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003633 Py_DECREF(output);
3634 return 0;
3635 }
3636 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003637 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003638}
3639
3640
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003641int
3642PyUnicode_FSDecoder(PyObject* arg, void* addr)
3643{
3644 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003645 if (arg == NULL) {
3646 Py_DECREF(*(PyObject**)addr);
3647 return 1;
3648 }
3649 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003650 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003651 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003652 output = arg;
3653 Py_INCREF(output);
3654 }
3655 else {
3656 arg = PyBytes_FromObject(arg);
3657 if (!arg)
3658 return 0;
3659 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3660 PyBytes_GET_SIZE(arg));
3661 Py_DECREF(arg);
3662 if (!output)
3663 return 0;
3664 if (!PyUnicode_Check(output)) {
3665 Py_DECREF(output);
3666 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3667 return 0;
3668 }
3669 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003670 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003671 Py_DECREF(output);
3672 return 0;
3673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003674 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003675 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003676 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3677 Py_DECREF(output);
3678 return 0;
3679 }
3680 *(PyObject**)addr = output;
3681 return Py_CLEANUP_SUPPORTED;
3682}
3683
3684
Martin v. Löwis5b222132007-06-10 09:51:05 +00003685char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003686PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003687{
Christian Heimesf3863112007-11-22 07:46:41 +00003688 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003689
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003690 if (!PyUnicode_Check(unicode)) {
3691 PyErr_BadArgument();
3692 return NULL;
3693 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003694 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003695 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003696
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003697 if (PyUnicode_UTF8(unicode) == NULL) {
3698 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003699 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3700 if (bytes == NULL)
3701 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003702 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3703 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704 Py_DECREF(bytes);
3705 return NULL;
3706 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003707 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3708 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3709 PyBytes_AS_STRING(bytes),
3710 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 Py_DECREF(bytes);
3712 }
3713
3714 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003715 *psize = PyUnicode_UTF8_LENGTH(unicode);
3716 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003717}
3718
3719char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3723}
3724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003725Py_UNICODE *
3726PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728 const unsigned char *one_byte;
3729#if SIZEOF_WCHAR_T == 4
3730 const Py_UCS2 *two_bytes;
3731#else
3732 const Py_UCS4 *four_bytes;
3733 const Py_UCS4 *ucs4_end;
3734 Py_ssize_t num_surrogates;
3735#endif
3736 wchar_t *w;
3737 wchar_t *wchar_end;
3738
3739 if (!PyUnicode_Check(unicode)) {
3740 PyErr_BadArgument();
3741 return NULL;
3742 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003743 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003745 assert(_PyUnicode_KIND(unicode) != 0);
3746 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003748 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003749#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3751 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 num_surrogates = 0;
3753
3754 for (; four_bytes < ucs4_end; ++four_bytes) {
3755 if (*four_bytes > 0xFFFF)
3756 ++num_surrogates;
3757 }
3758
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3760 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3761 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762 PyErr_NoMemory();
3763 return NULL;
3764 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003767 w = _PyUnicode_WSTR(unicode);
3768 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3769 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003770 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3771 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003772 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003774 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3775 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 }
3777 else
3778 *w = *four_bytes;
3779
3780 if (w > wchar_end) {
3781 assert(0 && "Miscalculated string end");
3782 }
3783 }
3784 *w = 0;
3785#else
3786 /* sizeof(wchar_t) == 4 */
3787 Py_FatalError("Impossible unicode object state, wstr and str "
3788 "should share memory already.");
3789 return NULL;
3790#endif
3791 }
3792 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003793 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3794 (_PyUnicode_LENGTH(unicode) + 1));
3795 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 PyErr_NoMemory();
3797 return NULL;
3798 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003799 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3800 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3801 w = _PyUnicode_WSTR(unicode);
3802 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003804 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3805 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 for (; w < wchar_end; ++one_byte, ++w)
3807 *w = *one_byte;
3808 /* null-terminate the wstr */
3809 *w = 0;
3810 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 for (; w < wchar_end; ++two_bytes, ++w)
3815 *w = *two_bytes;
3816 /* null-terminate the wstr */
3817 *w = 0;
3818#else
3819 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003820 PyObject_FREE(_PyUnicode_WSTR(unicode));
3821 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822 Py_FatalError("Impossible unicode object state, wstr "
3823 "and str should share memory already.");
3824 return NULL;
3825#endif
3826 }
3827 else {
3828 assert(0 && "This should never happen.");
3829 }
3830 }
3831 }
3832 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 *size = PyUnicode_WSTR_LENGTH(unicode);
3834 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003835}
3836
Alexander Belopolsky40018472011-02-26 01:02:56 +00003837Py_UNICODE *
3838PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841}
3842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843
Alexander Belopolsky40018472011-02-26 01:02:56 +00003844Py_ssize_t
3845PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846{
3847 if (!PyUnicode_Check(unicode)) {
3848 PyErr_BadArgument();
3849 goto onError;
3850 }
3851 return PyUnicode_GET_SIZE(unicode);
3852
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 return -1;
3855}
3856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857Py_ssize_t
3858PyUnicode_GetLength(PyObject *unicode)
3859{
Victor Stinner07621332012-06-16 04:53:46 +02003860 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 PyErr_BadArgument();
3862 return -1;
3863 }
Victor Stinner07621332012-06-16 04:53:46 +02003864 if (PyUnicode_READY(unicode) == -1)
3865 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866 return PyUnicode_GET_LENGTH(unicode);
3867}
3868
3869Py_UCS4
3870PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3871{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003872 void *data;
3873 int kind;
3874
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003875 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3876 PyErr_BadArgument();
3877 return (Py_UCS4)-1;
3878 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003879 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003880 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 return (Py_UCS4)-1;
3882 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003883 data = PyUnicode_DATA(unicode);
3884 kind = PyUnicode_KIND(unicode);
3885 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886}
3887
3888int
3889PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3890{
3891 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003892 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 return -1;
3894 }
Victor Stinner488fa492011-12-12 00:01:39 +01003895 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003896 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003897 PyErr_SetString(PyExc_IndexError, "string index out of range");
3898 return -1;
3899 }
Victor Stinner488fa492011-12-12 00:01:39 +01003900 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003901 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003902 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3903 PyErr_SetString(PyExc_ValueError, "character out of range");
3904 return -1;
3905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3907 index, ch);
3908 return 0;
3909}
3910
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3916
Victor Stinner554f3f02010-06-16 23:33:54 +00003917/* create or adjust a UnicodeDecodeError */
3918static void
3919make_decode_exception(PyObject **exceptionObject,
3920 const char *encoding,
3921 const char *input, Py_ssize_t length,
3922 Py_ssize_t startpos, Py_ssize_t endpos,
3923 const char *reason)
3924{
3925 if (*exceptionObject == NULL) {
3926 *exceptionObject = PyUnicodeDecodeError_Create(
3927 encoding, input, length, startpos, endpos, reason);
3928 }
3929 else {
3930 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3931 goto onError;
3932 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3933 goto onError;
3934 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3935 goto onError;
3936 }
3937 return;
3938
3939onError:
3940 Py_DECREF(*exceptionObject);
3941 *exceptionObject = NULL;
3942}
3943
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors/errorHandler: name of the error handler and a cache for the looked
       up callable (filled in on first use).
   encoding/reason:     used to build or update *exceptionObject.
   input/inend:         current input buffer; re-read from the exception
       after the callback, which might have replaced it.
   startinpos/endinpos: error range passed to the handler; *endinpos is set
       to the position the handler asks to resume at.
   inptr:               set to the resume pointer inside the input.
   output/outpos:       wchar_t-kind output string and write position; the
       string is resized when the replacement does not fit.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* &argparse[4] skips the "O!n;" format prefix and yields the message. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the code below would otherwise treat a non-bytes
           object as bytes with the exception already set. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* HAVE_MBCS */
4050
4051static int
4052unicode_decode_call_errorhandler_writer(
4053 const char *errors, PyObject **errorHandler,
4054 const char *encoding, const char *reason,
4055 const char **input, const char **inend, Py_ssize_t *startinpos,
4056 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4057 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4058{
4059 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4060
4061 PyObject *restuple = NULL;
4062 PyObject *repunicode = NULL;
4063 Py_ssize_t insize;
4064 Py_ssize_t newpos;
4065 PyObject *inputobj = NULL;
4066
4067 if (*errorHandler == NULL) {
4068 *errorHandler = PyCodec_LookupError(errors);
4069 if (*errorHandler == NULL)
4070 goto onError;
4071 }
4072
4073 make_decode_exception(exceptionObject,
4074 encoding,
4075 *input, *inend - *input,
4076 *startinpos, *endinpos,
4077 reason);
4078 if (*exceptionObject == NULL)
4079 goto onError;
4080
4081 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4082 if (restuple == NULL)
4083 goto onError;
4084 if (!PyTuple_Check(restuple)) {
4085 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4086 goto onError;
4087 }
4088 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004089 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004090
4091 /* Copy back the bytes variables, which might have been modified by the
4092 callback */
4093 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4094 if (!inputobj)
4095 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004096 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004098 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004099 *input = PyBytes_AS_STRING(inputobj);
4100 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004101 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004102 /* we can DECREF safely, as the exception has another reference,
4103 so the object won't go away. */
4104 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004108 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4110 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004111 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004113 writer->overallocate = 1;
4114 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4115 return
4116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004118 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004121 Py_XDECREF(restuple);
4122 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004126 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127}
4128
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004129/* --- UTF-7 Codec -------------------------------------------------------- */
4130
Antoine Pitrou244651a2009-05-04 18:56:13 +00004131/* See RFC2152 for details. We encode conservatively and decode liberally. */
4132
/* Base-64 helpers for the UTF-7 codec.  All of them are macros and may
   evaluate their argument more than once. */

/* IS_BASE64: non-zero when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0-63) of base-64 character c.  Only
   meaningful when IS_BASE64(c) holds: anything that is not a letter, a
   digit or '+' maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n; higher bits
   are masked off. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies except
 * '+', which starts a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4163
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets, indexed by character code 0-127:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify; 0 and anything >= 128 never encode
 * directly.  directO and directWS are parenthesized in the expansion so
 * that compound expressions (e.g. "a || b") can be passed safely.  c may
 * be evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004209
Alexander Belopolsky40018472011-02-26 01:02:56 +00004210PyObject *
4211PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004212 Py_ssize_t size,
4213 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004214{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004215 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4216}
4217
Antoine Pitrou244651a2009-05-04 18:56:13 +00004218/* The decoder. The only state we preserve is our read position,
4219 * i.e. how many characters we have consumed. So if we end in the
4220 * middle of a shift sequence we have to back off the read position
4221 * and the output to the beginning of the sequence, otherwise we lose
4222 * all the shift state (seen bits, number of bits seen, high
4223 * surrogate). */
4224
Alexander Belopolsky40018472011-02-26 01:02:56 +00004225PyObject *
4226PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004227 Py_ssize_t size,
4228 const char *errors,
4229 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004230{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004232 Py_ssize_t startinpos;
4233 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004234 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004236 const char *errmsg = "";
4237 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004238 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004239 unsigned int base64bits = 0;
4240 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004241 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 PyObject *errorHandler = NULL;
4243 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004244
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004245 if (size == 0) {
4246 if (consumed)
4247 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004248 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004249 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004250
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251 /* Start off assuming it's all ASCII. Widen later as necessary. */
4252 _PyUnicodeWriter_Init(&writer, 0);
4253 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4254 goto onError;
4255
4256 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004257 e = s + size;
4258
4259 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004260 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004262 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004263
Antoine Pitrou244651a2009-05-04 18:56:13 +00004264 if (inShift) { /* in a base-64 section */
4265 if (IS_BASE64(ch)) { /* consume a base-64 character */
4266 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4267 base64bits += 6;
4268 s++;
4269 if (base64bits >= 16) {
4270 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004271 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004272 base64bits -= 16;
4273 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4274 if (surrogate) {
4275 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004276 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4277 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004278 if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004279 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004280 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
4281 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004282 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004283 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004284 }
4285 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004286 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004287 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004288 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4289 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004291 }
4292 }
Victor Stinner551ac952011-11-29 22:58:13 +01004293 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004294 /* first surrogate */
4295 surrogate = outCh;
4296 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004299 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004300 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
4301 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004302 }
4303 }
4304 }
4305 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306 inShift = 0;
4307 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004308 if (surrogate) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004309 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004310 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004311 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4312 writer.pos++;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004313 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004314 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315 if (base64bits > 0) { /* left-over bits */
4316 if (base64bits >= 6) {
4317 /* We've seen at least one base-64 character */
4318 errmsg = "partial character in shift sequence";
4319 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004320 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321 else {
4322 /* Some bits remain; they should be zero */
4323 if (base64buffer != 0) {
4324 errmsg = "non-zero padding bits in shift sequence";
4325 goto utf7Error;
4326 }
4327 }
4328 }
4329 if (ch != '-') {
4330 /* '-' is absorbed; other terminating
4331 characters are preserved */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004334 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4335 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004337 }
4338 }
4339 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 s++; /* consume '+' */
4342 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004345 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
4347 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 }
4349 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 }
4354 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004356 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004357 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4358 goto onError;
4359 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4360 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 else {
4363 startinpos = s-starts;
4364 s++;
4365 errmsg = "unexpected special character";
4366 goto utf7Error;
4367 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 errors, &errorHandler,
4373 "utf7", errmsg,
4374 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004375 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 }
4378
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 /* end of string */
4380
4381 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4382 /* if we're in an inconsistent state, that's an error */
4383 if (surrogate ||
4384 (base64bits >= 6) ||
4385 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004387 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 errors, &errorHandler,
4389 "utf7", "unterminated shift sequence",
4390 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004391 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392 goto onError;
4393 if (s < e)
4394 goto restart;
4395 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397
4398 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004399 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004401 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004402 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 }
4404 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004405 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004407 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 Py_XDECREF(errorHandler);
4410 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 Py_XDECREF(errorHandler);
4415 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 return NULL;
4418}
4419
4420
Alexander Belopolsky40018472011-02-26 01:02:56 +00004421PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004422_PyUnicode_EncodeUTF7(PyObject *str,
4423 int base64SetO,
4424 int base64WhiteSpace,
4425 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004427 int kind;
4428 void *data;
4429 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004430 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004432 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 unsigned int base64bits = 0;
4434 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435 char * out;
4436 char * start;
4437
Benjamin Petersonbac79492012-01-14 13:34:47 -05004438 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004439 return NULL;
4440 kind = PyUnicode_KIND(str);
4441 data = PyUnicode_DATA(str);
4442 len = PyUnicode_GET_LENGTH(str);
4443
4444 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004446
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004447 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004448 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004449 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004450 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451 if (v == NULL)
4452 return NULL;
4453
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004454 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004455 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004456 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 if (inShift) {
4459 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4460 /* shifting out */
4461 if (base64bits) { /* output remaining bits */
4462 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4463 base64buffer = 0;
4464 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465 }
4466 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 /* Characters not in the BASE64 set implicitly unshift the sequence
4468 so no '-' is required, except if the character is itself a '-' */
4469 if (IS_BASE64(ch) || ch == '-') {
4470 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004472 *out++ = (char) ch;
4473 }
4474 else {
4475 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004476 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 else { /* not in a shift sequence */
4479 if (ch == '+') {
4480 *out++ = '+';
4481 *out++ = '-';
4482 }
4483 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4484 *out++ = (char) ch;
4485 }
4486 else {
4487 *out++ = '+';
4488 inShift = 1;
4489 goto encode_char;
4490 }
4491 }
4492 continue;
4493encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004495 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004496
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 /* code first surrogate */
4498 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004499 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 while (base64bits >= 6) {
4501 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4502 base64bits -= 6;
4503 }
4504 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004505 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 base64bits += 16;
4508 base64buffer = (base64buffer << 16) | ch;
4509 while (base64bits >= 6) {
4510 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4511 base64bits -= 6;
4512 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004513 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 if (base64bits)
4515 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4516 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004518 if (_PyBytes_Resize(&v, out - start) < 0)
4519 return NULL;
4520 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004522PyObject *
4523PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4524 Py_ssize_t size,
4525 int base64SetO,
4526 int base64WhiteSpace,
4527 const char *errors)
4528{
4529 PyObject *result;
4530 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4531 if (tmp == NULL)
4532 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004533 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004534 base64WhiteSpace, errors);
4535 Py_DECREF(tmp);
4536 return result;
4537}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539#undef IS_BASE64
4540#undef FROM_BASE64
4541#undef TO_BASE64
4542#undef DECODE_DIRECT
4543#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545/* --- UTF-8 Codec -------------------------------------------------------- */
4546
Alexander Belopolsky40018472011-02-26 01:02:56 +00004547PyObject *
4548PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004549 Py_ssize_t size,
4550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551{
Walter Dörwald69652032004-09-07 20:24:22 +00004552 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4553}
4554
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004555#include "stringlib/asciilib.h"
4556#include "stringlib/codecs.h"
4557#include "stringlib/undef.h"
4558
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004559#include "stringlib/ucs1lib.h"
4560#include "stringlib/codecs.h"
4561#include "stringlib/undef.h"
4562
4563#include "stringlib/ucs2lib.h"
4564#include "stringlib/codecs.h"
4565#include "stringlib/undef.h"
4566
4567#include "stringlib/ucs4lib.h"
4568#include "stringlib/codecs.h"
4569#include "stringlib/undef.h"
4570
Antoine Pitrouab868312009-01-10 15:40:25 +00004571/* Mask to quickly check whether a C 'long' contains a
4572 non-ASCII, UTF8-encoded char. */
4573#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004574# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004575#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004576# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004577#else
4578# error C 'long' size should be either 4 or 8!
4579#endif
4580
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004581static Py_ssize_t
4582ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004583{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004584 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004585 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004586
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004587#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004588 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4589 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004590 /* Fast path, see in STRINGLIB(utf8_decode) for
4591 an explanation. */
4592 /* Help register allocation */
4593 register const char *_p = p;
4594 register Py_UCS1 * q = dest;
4595 while (_p < aligned_end) {
4596 unsigned long value = *(const unsigned long *) _p;
4597 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004599 *((unsigned long *)q) = value;
4600 _p += SIZEOF_LONG;
4601 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004602 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004603 p = _p;
4604 while (p < end) {
4605 if ((unsigned char)*p & 0x80)
4606 break;
4607 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004609 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004611#endif
4612 while (p < end) {
4613 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4614 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004615 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004616 /* Help register allocation */
4617 register const char *_p = p;
4618 while (_p < aligned_end) {
4619 unsigned long value = *(unsigned long *) _p;
4620 if (value & ASCII_CHAR_MASK)
4621 break;
4622 _p += SIZEOF_LONG;
4623 }
4624 p = _p;
4625 if (_p == end)
4626 break;
4627 }
4628 if ((unsigned char)*p & 0x80)
4629 break;
4630 ++p;
4631 }
4632 memcpy(dest, start, p - start);
4633 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634}
Antoine Pitrouab868312009-01-10 15:40:25 +00004635
Victor Stinner785938e2011-12-11 20:09:03 +01004636PyObject *
4637PyUnicode_DecodeUTF8Stateful(const char *s,
4638 Py_ssize_t size,
4639 const char *errors,
4640 Py_ssize_t *consumed)
4641{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004642 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004643 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004644 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004645
4646 Py_ssize_t startinpos;
4647 Py_ssize_t endinpos;
4648 const char *errmsg = "";
4649 PyObject *errorHandler = NULL;
4650 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004651
4652 if (size == 0) {
4653 if (consumed)
4654 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004655 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004656 }
4657
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4659 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004660 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004661 *consumed = 1;
4662 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004663 }
4664
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004665 _PyUnicodeWriter_Init(&writer, 0);
4666 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4667 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004668
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004669 writer.pos = ascii_decode(s, end, writer.data);
4670 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671 while (s < end) {
4672 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004675 if (PyUnicode_IS_ASCII(writer.buffer))
4676 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004678 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004679 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681 } else {
4682 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004683 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684 }
4685
4686 switch (ch) {
4687 case 0:
4688 if (s == end || consumed)
4689 goto End;
4690 errmsg = "unexpected end of data";
4691 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004692 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 break;
4694 case 1:
4695 errmsg = "invalid start byte";
4696 startinpos = s - starts;
4697 endinpos = startinpos + 1;
4698 break;
4699 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004700 case 3:
4701 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 errmsg = "invalid continuation byte";
4703 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004704 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004705 break;
4706 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004707 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004708 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004709 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4710 writer.pos++;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 continue;
4712 }
4713
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004714 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004715 errors, &errorHandler,
4716 "utf-8", errmsg,
4717 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004719 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004720 }
4721
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723 if (consumed)
4724 *consumed = s - starts;
4725
4726 Py_XDECREF(errorHandler);
4727 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004728 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729
4730onError:
4731 Py_XDECREF(errorHandler);
4732 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004733 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004735}
4736
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004737#ifdef __APPLE__
4738
4739/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004740 used to decode the command line arguments on Mac OS X.
4741
4742 Return a pointer to a newly allocated wide character string (use
4743 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004744
4745wchar_t*
4746_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4747{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004748 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004749 wchar_t *unicode;
4750 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004751
4752 /* Note: size will always be longer than the resulting Unicode
4753 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004754 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004755 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004756 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4757 if (!unicode)
4758 return NULL;
4759
4760 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004761 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004762 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004763 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004765#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004766 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004767#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004769#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 if (ch > 0xFF) {
4771#if SIZEOF_WCHAR_T == 4
4772 assert(0);
4773#else
4774 assert(Py_UNICODE_IS_SURROGATE(ch));
4775 /* compute and append the two surrogates: */
4776 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4777 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4778#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004779 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004780 else {
4781 if (!ch && s == e)
4782 break;
4783 /* surrogateescape */
4784 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4785 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004786 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004788 return unicode;
4789}
4790
4791#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004793/* Primary internal function which creates utf8 encoded bytes objects.
4794
4795 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004796 and allocate exactly as much space needed at the end. Else allocate the
4797 maximum possible needed (4 result bytes per Unicode character), and return
4798 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004799*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004800PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004801_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802{
Victor Stinner6099a032011-12-18 14:22:26 +01004803 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004804 void *data;
4805 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004807 if (!PyUnicode_Check(unicode)) {
4808 PyErr_BadArgument();
4809 return NULL;
4810 }
4811
4812 if (PyUnicode_READY(unicode) == -1)
4813 return NULL;
4814
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004815 if (PyUnicode_UTF8(unicode))
4816 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4817 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004818
4819 kind = PyUnicode_KIND(unicode);
4820 data = PyUnicode_DATA(unicode);
4821 size = PyUnicode_GET_LENGTH(unicode);
4822
Benjamin Petersonead6b532011-12-20 17:23:42 -06004823 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004824 default:
4825 assert(0);
4826 case PyUnicode_1BYTE_KIND:
4827 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4828 assert(!PyUnicode_IS_ASCII(unicode));
4829 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4830 case PyUnicode_2BYTE_KIND:
4831 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4832 case PyUnicode_4BYTE_KIND:
4833 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835}
4836
Alexander Belopolsky40018472011-02-26 01:02:56 +00004837PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004838PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4839 Py_ssize_t size,
4840 const char *errors)
4841{
4842 PyObject *v, *unicode;
4843
4844 unicode = PyUnicode_FromUnicode(s, size);
4845 if (unicode == NULL)
4846 return NULL;
4847 v = _PyUnicode_AsUTF8String(unicode, errors);
4848 Py_DECREF(unicode);
4849 return v;
4850}
4851
4852PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004853PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004855 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856}
4857
Walter Dörwald41980ca2007-08-16 21:55:45 +00004858/* --- UTF-32 Codec ------------------------------------------------------- */
4859
4860PyObject *
4861PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 Py_ssize_t size,
4863 const char *errors,
4864 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004865{
4866 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4867}
4868
4869PyObject *
4870PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 Py_ssize_t size,
4872 const char *errors,
4873 int *byteorder,
4874 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004875{
4876 const char *starts = s;
4877 Py_ssize_t startinpos;
4878 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004879 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004880 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004881 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004882 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004883 PyObject *errorHandler = NULL;
4884 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004885
Walter Dörwald41980ca2007-08-16 21:55:45 +00004886 q = (unsigned char *)s;
4887 e = q + size;
4888
4889 if (byteorder)
4890 bo = *byteorder;
4891
4892 /* Check for BOM marks (U+FEFF) in the input and adjust current
4893 byte order setting accordingly. In native mode, the leading BOM
4894 mark is skipped, in all other modes, it is copied to the output
4895 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004896 if (bo == 0 && size >= 4) {
4897 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4898 if (bom == 0x0000FEFF) {
4899 bo = -1;
4900 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004902 else if (bom == 0xFFFE0000) {
4903 bo = 1;
4904 q += 4;
4905 }
4906 if (byteorder)
4907 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004908 }
4909
Victor Stinnere64322e2012-10-30 23:12:47 +01004910 if (q == e) {
4911 if (consumed)
4912 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004913 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004914 }
4915
Victor Stinnere64322e2012-10-30 23:12:47 +01004916#ifdef WORDS_BIGENDIAN
4917 le = bo < 0;
4918#else
4919 le = bo <= 0;
4920#endif
4921
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004922 _PyUnicodeWriter_Init(&writer, 0);
4923 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4924 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004925
Victor Stinnere64322e2012-10-30 23:12:47 +01004926 while (1) {
4927 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004928 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004929
Victor Stinnere64322e2012-10-30 23:12:47 +01004930 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004931 enum PyUnicode_Kind kind = writer.kind;
4932 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004933 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004934 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004935 if (le) {
4936 do {
4937 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4938 if (ch > maxch)
4939 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004940 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004941 q += 4;
4942 } while (q <= last);
4943 }
4944 else {
4945 do {
4946 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4947 if (ch > maxch)
4948 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004949 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004950 q += 4;
4951 } while (q <= last);
4952 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004953 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004954 }
4955
4956 if (ch <= maxch) {
4957 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004959 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004961 startinpos = ((const char *)q) - starts;
4962 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004964 else {
4965 if (ch < 0x110000) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004966 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinnere64322e2012-10-30 23:12:47 +01004967 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004968 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4969 writer.pos++;
Victor Stinnere64322e2012-10-30 23:12:47 +01004970 q += 4;
4971 continue;
4972 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004974 startinpos = ((const char *)q) - starts;
4975 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004977
4978 /* The remaining input chars are ignored if the callback
4979 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004980 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 errors, &errorHandler,
4982 "utf32", errmsg,
4983 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004984 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986 }
4987
Walter Dörwald41980ca2007-08-16 21:55:45 +00004988 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991 Py_XDECREF(errorHandler);
4992 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004994
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997 Py_XDECREF(errorHandler);
4998 Py_XDECREF(exc);
4999 return NULL;
5000}
5001
5002PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005003_PyUnicode_EncodeUTF32(PyObject *str,
5004 const char *errors,
5005 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005007 int kind;
5008 void *data;
5009 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005010 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005012 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005013 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005014#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005015 int iorder[] = {0, 1, 2, 3};
5016#else
5017 int iorder[] = {3, 2, 1, 0};
5018#endif
5019
Benjamin Peterson29060642009-01-31 22:14:21 +00005020#define STORECHAR(CH) \
5021 do { \
5022 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5023 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5024 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5025 p[iorder[0]] = (CH) & 0xff; \
5026 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005027 } while(0)
5028
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005029 if (!PyUnicode_Check(str)) {
5030 PyErr_BadArgument();
5031 return NULL;
5032 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005033 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005034 return NULL;
5035 kind = PyUnicode_KIND(str);
5036 data = PyUnicode_DATA(str);
5037 len = PyUnicode_GET_LENGTH(str);
5038
5039 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005040 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005042 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043 if (v == NULL)
5044 return NULL;
5045
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005046 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005049 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005050 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051
5052 if (byteorder == -1) {
5053 /* force LE */
5054 iorder[0] = 0;
5055 iorder[1] = 1;
5056 iorder[2] = 2;
5057 iorder[3] = 3;
5058 }
5059 else if (byteorder == 1) {
5060 /* force BE */
5061 iorder[0] = 3;
5062 iorder[1] = 2;
5063 iorder[2] = 1;
5064 iorder[3] = 0;
5065 }
5066
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005067 for (i = 0; i < len; i++)
5068 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005069
5070 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005071 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072#undef STORECHAR
5073}
5074
Alexander Belopolsky40018472011-02-26 01:02:56 +00005075PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005076PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5077 Py_ssize_t size,
5078 const char *errors,
5079 int byteorder)
5080{
5081 PyObject *result;
5082 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5083 if (tmp == NULL)
5084 return NULL;
5085 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5086 Py_DECREF(tmp);
5087 return result;
5088}
5089
5090PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005091PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092{
Victor Stinnerb960b342011-11-20 19:12:52 +01005093 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094}
5095
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096/* --- UTF-16 Codec ------------------------------------------------------- */
5097
Tim Peters772747b2001-08-09 22:21:55 +00005098PyObject *
5099PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 Py_ssize_t size,
5101 const char *errors,
5102 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103{
Walter Dörwald69652032004-09-07 20:24:22 +00005104 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5105}
5106
5107PyObject *
5108PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 Py_ssize_t size,
5110 const char *errors,
5111 int *byteorder,
5112 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005113{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005115 Py_ssize_t startinpos;
5116 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005117 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005118 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005119 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005120 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005121 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 PyObject *errorHandler = NULL;
5123 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124
Tim Peters772747b2001-08-09 22:21:55 +00005125 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005126 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127
5128 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005129 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005131 /* Check for BOM marks (U+FEFF) in the input and adjust current
5132 byte order setting accordingly. In native mode, the leading BOM
5133 mark is skipped, in all other modes, it is copied to the output
5134 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005135 if (bo == 0 && size >= 2) {
5136 const Py_UCS4 bom = (q[1] << 8) | q[0];
5137 if (bom == 0xFEFF) {
5138 q += 2;
5139 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005141 else if (bom == 0xFFFE) {
5142 q += 2;
5143 bo = 1;
5144 }
5145 if (byteorder)
5146 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148
Antoine Pitrou63065d72012-05-15 23:48:04 +02005149 if (q == e) {
5150 if (consumed)
5151 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005152 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005153 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005154
Christian Heimes743e0cd2012-10-17 23:52:17 +02005155#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005156 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005157#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005158 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005159#endif
Tim Peters772747b2001-08-09 22:21:55 +00005160
Antoine Pitrou63065d72012-05-15 23:48:04 +02005161 /* Note: size will always be longer than the resulting Unicode
5162 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005163 _PyUnicodeWriter_Init(&writer, 0);
5164 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5165 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005166
Antoine Pitrou63065d72012-05-15 23:48:04 +02005167 while (1) {
5168 Py_UCS4 ch = 0;
5169 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005170 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005171 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005172 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005173 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005174 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005175 native_ordering);
5176 else
5177 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005178 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005179 native_ordering);
5180 } else if (kind == PyUnicode_2BYTE_KIND) {
5181 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005182 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005183 native_ordering);
5184 } else {
5185 assert(kind == PyUnicode_4BYTE_KIND);
5186 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005187 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005188 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005189 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005190 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005191
Antoine Pitrou63065d72012-05-15 23:48:04 +02005192 switch (ch)
5193 {
5194 case 0:
5195 /* remaining byte at the end? (size should be even) */
5196 if (q == e || consumed)
5197 goto End;
5198 errmsg = "truncated data";
5199 startinpos = ((const char *)q) - starts;
5200 endinpos = ((const char *)e) - starts;
5201 break;
5202 /* The remaining input chars are ignored if the callback
5203 chooses to skip the input */
5204 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005205 q -= 2;
5206 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005207 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005208 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005209 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005210 endinpos = ((const char *)e) - starts;
5211 break;
5212 case 2:
5213 errmsg = "illegal encoding";
5214 startinpos = ((const char *)q) - 2 - starts;
5215 endinpos = startinpos + 2;
5216 break;
5217 case 3:
5218 errmsg = "illegal UTF-16 surrogate";
5219 startinpos = ((const char *)q) - 4 - starts;
5220 endinpos = startinpos + 2;
5221 break;
5222 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005223 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005224 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005225 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
5226 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 continue;
5228 }
5229
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005230 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005231 errors,
5232 &errorHandler,
5233 "utf16", errmsg,
5234 &starts,
5235 (const char **)&e,
5236 &startinpos,
5237 &endinpos,
5238 &exc,
5239 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005240 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 }
5243
Antoine Pitrou63065d72012-05-15 23:48:04 +02005244End:
Walter Dörwald69652032004-09-07 20:24:22 +00005245 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005248 Py_XDECREF(errorHandler);
5249 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005250 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005253 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005254 Py_XDECREF(errorHandler);
5255 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 return NULL;
5257}
5258
Tim Peters772747b2001-08-09 22:21:55 +00005259PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005260_PyUnicode_EncodeUTF16(PyObject *str,
5261 const char *errors,
5262 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005264 enum PyUnicode_Kind kind;
5265 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005266 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005267 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005268 unsigned short *out;
5269 Py_ssize_t bytesize;
5270 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005271#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005272 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005273#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005274 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005275#endif
5276
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005277 if (!PyUnicode_Check(str)) {
5278 PyErr_BadArgument();
5279 return NULL;
5280 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005281 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005282 return NULL;
5283 kind = PyUnicode_KIND(str);
5284 data = PyUnicode_DATA(str);
5285 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005286
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005287 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005288 if (kind == PyUnicode_4BYTE_KIND) {
5289 const Py_UCS4 *in = (const Py_UCS4 *)data;
5290 const Py_UCS4 *end = in + len;
5291 while (in < end)
5292 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005293 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005294 }
5295 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005297 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005298 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 if (v == NULL)
5300 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005302 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005303 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005304 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005306 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005307 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005308 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005309
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005310 switch (kind) {
5311 case PyUnicode_1BYTE_KIND: {
5312 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5313 break;
Tim Peters772747b2001-08-09 22:21:55 +00005314 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005315 case PyUnicode_2BYTE_KIND: {
5316 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5317 break;
Tim Peters772747b2001-08-09 22:21:55 +00005318 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005319 case PyUnicode_4BYTE_KIND: {
5320 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5321 break;
5322 }
5323 default:
5324 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005325 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005326
5327 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005328 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329}
5330
Alexander Belopolsky40018472011-02-26 01:02:56 +00005331PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005332PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5333 Py_ssize_t size,
5334 const char *errors,
5335 int byteorder)
5336{
5337 PyObject *result;
5338 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5339 if (tmp == NULL)
5340 return NULL;
5341 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5342 Py_DECREF(tmp);
5343 return result;
5344}
5345
5346PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005347PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005349 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350}
5351
5352/* --- Unicode Escape Codec ----------------------------------------------- */
5353
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string decodes to pure ASCII using only the simple
   one-character escapes.

   Returns the number of characters the decoded string will contain, or -1
   when that cannot be determined cheaply:
     - size is negative,
     - a byte above 127 is present,
     - a backslash is the last byte,
     - an octal, \x, \u, \U or \N escape is present (these do not guarantee
       an ASCII result).

   A backslash followed by a newline contributes no characters; an
   unrecognised escape contributes two (the backslash and the character). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming s + size: computing a pointer in front
       of the buffer is undefined behaviour even if never dereferenced. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5408
Fredrik Lundh06d12682001-01-24 07:59:11 +00005409static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005410
Alexander Belopolsky40018472011-02-26 01:02:56 +00005411PyObject *
5412PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005413 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005414 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005417 Py_ssize_t startinpos;
5418 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005419 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005421 char* message;
5422 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005423 PyObject *errorHandler = NULL;
5424 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005425 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005426
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005427 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005428 if (len == 0)
5429 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005430
5431 /* After length_of_escaped_ascii_string() there are two alternatives,
5432 either the string is pure ASCII with named escapes like \n, etc.
5433 and we determined it's exact size (common case)
5434 or it contains \x, \u, ... escape sequences. then we create a
5435 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005436 _PyUnicodeWriter_Init(&writer, 0);
5437 if (len > 0) {
5438 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005440 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 }
5442 else {
5443 /* Escaped strings will always be longer than the resulting
5444 Unicode string, so we start with size here and then reduce the
5445 length after conversion to the true value.
5446 (but if the error callback returns a long replacement string
5447 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005448 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005450 }
5451
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005453 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005455
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 while (s < end) {
5457 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005458 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460
5461 /* Non-escape characters are interpreted as Unicode ordinals */
5462 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005463 x = (unsigned char)*s;
5464 s++;
5465 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005466 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005467 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5468 writer.pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 continue;
5470 }
5471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 /* \ - Escapes */
5474 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005475 c = *s++;
5476 if (s > end)
5477 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005479 /* The only case in which i == ascii_length is a backslash
5480 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005481 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005483 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005486#define WRITECHAR(ch) \
5487 do { \
5488 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) \
5489 goto onError; \
5490 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \
5491 writer.pos++; \
5492 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005493
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005495 case '\\': WRITECHAR('\\'); break;
5496 case '\'': WRITECHAR('\''); break;
5497 case '\"': WRITECHAR('\"'); break;
5498 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005499 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005500 case 'f': WRITECHAR('\014'); break;
5501 case 't': WRITECHAR('\t'); break;
5502 case 'n': WRITECHAR('\n'); break;
5503 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005504 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005505 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005506 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005507 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 case '0': case '1': case '2': case '3':
5511 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005512 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005513 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005514 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005515 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005516 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005518 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 break;
5520
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 /* hex escapes */
5522 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005524 digits = 2;
5525 message = "truncated \\xXX escape";
5526 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005530 digits = 4;
5531 message = "truncated \\uXXXX escape";
5532 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005535 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005536 digits = 8;
5537 message = "truncated \\UXXXXXXXX escape";
5538 hexescape:
5539 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005540 if (end - s < digits) {
5541 /* count only hex digits */
5542 for (; s < end; ++s) {
5543 c = (unsigned char)*s;
5544 if (!Py_ISXDIGIT(c))
5545 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005546 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005547 goto error;
5548 }
5549 for (; digits--; ++s) {
5550 c = (unsigned char)*s;
5551 if (!Py_ISXDIGIT(c))
5552 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005553 chr = (chr<<4) & ~0xF;
5554 if (c >= '0' && c <= '9')
5555 chr += c - '0';
5556 else if (c >= 'a' && c <= 'f')
5557 chr += 10 + c - 'a';
5558 else
5559 chr += 10 + c - 'A';
5560 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005561 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005562 /* _decoding_error will have already written into the
5563 target buffer. */
5564 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005565 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005566 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005567 message = "illegal Unicode character";
5568 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005569 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005570 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005571 break;
5572
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005574 case 'N':
5575 message = "malformed \\N character escape";
5576 if (ucnhash_CAPI == NULL) {
5577 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5579 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005580 if (ucnhash_CAPI == NULL)
5581 goto ucnhashError;
5582 }
5583 if (*s == '{') {
5584 const char *start = s+1;
5585 /* look for the closing brace */
5586 while (*s != '}' && s < end)
5587 s++;
5588 if (s > start && s < end && *s == '}') {
5589 /* found a name. look it up in the unicode database */
5590 message = "unknown Unicode character name";
5591 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005592 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005593 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005594 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005595 goto store;
5596 }
5597 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005598 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005599
5600 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005601 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 message = "\\ at end of string";
5603 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005604 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005605 }
5606 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005607 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005608 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005609 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005610 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005612 continue;
5613
5614 error:
5615 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005616 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005617 errors, &errorHandler,
5618 "unicodeescape", message,
5619 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005620 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005621 goto onError;
5622 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005624#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005625
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005626 Py_XDECREF(errorHandler);
5627 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005629
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005631 PyErr_SetString(
5632 PyExc_UnicodeError,
5633 "\\N escapes not supported (can't load unicodedata module)"
5634 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 Py_XDECREF(errorHandler);
5637 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005638 return NULL;
5639
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642 Py_XDECREF(errorHandler);
5643 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 return NULL;
5645}
5646
5647/* Return a Unicode-Escape string version of the Unicode object.
5648
5649 If quotes is true, the string is enclosed in u"" or u'' quotes as
5650 appropriate.
5651
5652*/
5653
Alexander Belopolsky40018472011-02-26 01:02:56 +00005654PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005655PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005658 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660 int kind;
5661 void *data;
5662 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663
Ezio Melottie7f90372012-10-05 03:33:31 +03005664 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005665 escape.
5666
Ezio Melottie7f90372012-10-05 03:33:31 +03005667 For UCS1 strings it's '\xxx', 4 bytes per source character.
5668 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5669 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005670 */
5671
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005672 if (!PyUnicode_Check(unicode)) {
5673 PyErr_BadArgument();
5674 return NULL;
5675 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005676 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005677 return NULL;
5678 len = PyUnicode_GET_LENGTH(unicode);
5679 kind = PyUnicode_KIND(unicode);
5680 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005681 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005682 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5683 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5684 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5685 }
5686
5687 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005688 return PyBytes_FromStringAndSize(NULL, 0);
5689
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005690 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005692
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005693 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005695 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 if (repr == NULL)
5698 return NULL;
5699
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005700 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005702 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005703 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005704
Walter Dörwald79e913e2007-05-12 11:08:06 +00005705 /* Escape backslashes */
5706 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 *p++ = '\\';
5708 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005709 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005710 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005711
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005712 /* Map 21-bit characters to '\U00xxxxxx' */
5713 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005714 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005715 *p++ = '\\';
5716 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005717 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5718 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5719 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5720 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5721 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5722 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5723 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5724 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005726 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005727
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005729 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 *p++ = '\\';
5731 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005732 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5733 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5734 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5735 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005737
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005738 /* Map special whitespace to '\t', \n', '\r' */
5739 else if (ch == '\t') {
5740 *p++ = '\\';
5741 *p++ = 't';
5742 }
5743 else if (ch == '\n') {
5744 *p++ = '\\';
5745 *p++ = 'n';
5746 }
5747 else if (ch == '\r') {
5748 *p++ = '\\';
5749 *p++ = 'r';
5750 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005751
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005752 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005753 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005755 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005756 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5757 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005758 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005759
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 /* Copy everything else as-is */
5761 else
5762 *p++ = (char) ch;
5763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005765 assert(p - PyBytes_AS_STRING(repr) > 0);
5766 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5767 return NULL;
5768 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769}
5770
Alexander Belopolsky40018472011-02-26 01:02:56 +00005771PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005772PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5773 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005775 PyObject *result;
5776 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5777 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005779 result = PyUnicode_AsUnicodeEscapeString(tmp);
5780 Py_DECREF(tmp);
5781 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782}
5783
5784/* --- Raw Unicode Escape Codec ------------------------------------------- */
5785
Alexander Belopolsky40018472011-02-26 01:02:56 +00005786PyObject *
5787PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005788 Py_ssize_t size,
5789 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792 Py_ssize_t startinpos;
5793 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005794 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 const char *end;
5796 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 PyObject *errorHandler = NULL;
5798 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005799
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005800 if (size == 0)
5801 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 /* Escaped strings will always be longer than the resulting
5804 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 length after conversion to the true value. (But decoding error
5806 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005807 _PyUnicodeWriter_Init(&writer, 1);
5808 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005810
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 end = s + size;
5812 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 unsigned char c;
5814 Py_UCS4 x;
5815 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005816 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 /* Non-escape characters are interpreted as Unicode ordinals */
5819 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005820 x = (unsigned char)*s++;
5821 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005822 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005823 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5824 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005826 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 startinpos = s-starts;
5828
5829 /* \u-escapes are only interpreted iff the number of leading
5830 backslashes if odd */
5831 bs = s;
5832 for (;s < end;) {
5833 if (*s != '\\')
5834 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005835 x = (unsigned char)*s++;
5836 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005837 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005838 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5839 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 }
5841 if (((s - bs) & 1) == 0 ||
5842 s >= end ||
5843 (*s != 'u' && *s != 'U')) {
5844 continue;
5845 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005846 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 count = *s=='u' ? 4 : 8;
5848 s++;
5849
5850 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 for (x = 0, i = 0; i < count; ++i, ++s) {
5852 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005853 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 errors, &errorHandler,
5857 "rawunicodeescape", "truncated \\uXXXX",
5858 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005859 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 goto onError;
5861 goto nextByte;
5862 }
5863 x = (x<<4) & ~0xF;
5864 if (c >= '0' && c <= '9')
5865 x += c - '0';
5866 else if (c >= 'a' && c <= 'f')
5867 x += 10 + c - 'a';
5868 else
5869 x += 10 + c - 'A';
5870 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005871 if (x <= MAX_UNICODE) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005872 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005873 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005874 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5875 writer.pos++;
5876 }
5877 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005878 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005880 errors, &errorHandler,
5881 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005883 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005885 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 nextByte:
5887 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 Py_XDECREF(errorHandler);
5890 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005891 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005892
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 return NULL;
5898}
5899
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900
Alexander Belopolsky40018472011-02-26 01:02:56 +00005901PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005904 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 char *p;
5906 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 Py_ssize_t expandsize, pos;
5908 int kind;
5909 void *data;
5910 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005912 if (!PyUnicode_Check(unicode)) {
5913 PyErr_BadArgument();
5914 return NULL;
5915 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005916 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005917 return NULL;
5918 kind = PyUnicode_KIND(unicode);
5919 data = PyUnicode_DATA(unicode);
5920 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005921 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5922 bytes, and 1 byte characters 4. */
5923 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005924
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005925 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005927
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 if (repr == NULL)
5930 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005932 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005934 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005935 for (pos = 0; pos < len; pos++) {
5936 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 /* Map 32-bit characters to '\Uxxxxxxxx' */
5938 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005939 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005940 *p++ = '\\';
5941 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005942 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5943 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5944 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5945 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5946 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5947 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5948 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5949 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005950 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 *p++ = '\\';
5954 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005955 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5956 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5957 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5958 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 /* Copy everything else as-is */
5961 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 *p++ = (char) ch;
5963 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005964
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005965 assert(p > q);
5966 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005967 return NULL;
5968 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969}
5970
Alexander Belopolsky40018472011-02-26 01:02:56 +00005971PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5973 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005975 PyObject *result;
5976 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5977 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005978 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005979 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5980 Py_DECREF(tmp);
5981 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982}
5983
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005984/* --- Unicode Internal Codec ------------------------------------------- */
5985
Alexander Belopolsky40018472011-02-26 01:02:56 +00005986PyObject *
5987_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005988 Py_ssize_t size,
5989 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005990{
5991 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005992 Py_ssize_t startinpos;
5993 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005994 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005995 const char *end;
5996 const char *reason;
5997 PyObject *errorHandler = NULL;
5998 PyObject *exc = NULL;
5999
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006000 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006001 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006002 1))
6003 return NULL;
6004
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006005 if (size == 0)
6006 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006007
Thomas Wouters89f507f2006-12-13 04:49:30 +00006008 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006009 _PyUnicodeWriter_Init(&writer, 0);
6010 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006012 end = s + size;
6013
6014 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006015 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006016 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006017 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006018 endinpos = end-starts;
6019 reason = "truncated input";
6020 goto error;
6021 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006022 /* We copy the raw representation one byte at a time because the
6023 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006024 ((char *) &uch)[0] = s[0];
6025 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006026#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006027 ((char *) &uch)[2] = s[2];
6028 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006029#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006030 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006031#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006032 /* We have to sanity check the raw data, otherwise doom looms for
6033 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006034 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006035 endinpos = s - starts + Py_UNICODE_SIZE;
6036 reason = "illegal code point (> 0x10FFFF)";
6037 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006038 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006039#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006040 s += Py_UNICODE_SIZE;
6041#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006042 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006043 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006044 Py_UNICODE uch2;
6045 ((char *) &uch2)[0] = s[0];
6046 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006047 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006048 {
Victor Stinner551ac952011-11-29 22:58:13 +01006049 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006050 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006051 }
6052 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006053#endif
6054
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006055 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006056 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006057 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
6058 writer.pos++;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006059 continue;
6060
6061 error:
6062 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006063 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006064 errors, &errorHandler,
6065 "unicode_internal", reason,
6066 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006067 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006068 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006069 }
6070
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006071 Py_XDECREF(errorHandler);
6072 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006073 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006074
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006076 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006077 Py_XDECREF(errorHandler);
6078 Py_XDECREF(exc);
6079 return NULL;
6080}
6081
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082/* --- Latin-1 Codec ------------------------------------------------------ */
6083
Alexander Belopolsky40018472011-02-26 01:02:56 +00006084PyObject *
6085PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006086 Py_ssize_t size,
6087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006090 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091}
6092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006094static void
6095make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006096 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006097 PyObject *unicode,
6098 Py_ssize_t startpos, Py_ssize_t endpos,
6099 const char *reason)
6100{
6101 if (*exceptionObject == NULL) {
6102 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006103 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006104 encoding, unicode, startpos, endpos, reason);
6105 }
6106 else {
6107 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6108 goto onError;
6109 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6110 goto onError;
6111 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6112 goto onError;
6113 return;
6114 onError:
6115 Py_DECREF(*exceptionObject);
6116 *exceptionObject = NULL;
6117 }
6118}
6119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006121static void
6122raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006123 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006124 PyObject *unicode,
6125 Py_ssize_t startpos, Py_ssize_t endpos,
6126 const char *reason)
6127{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006128 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006129 encoding, unicode, startpos, endpos, reason);
6130 if (*exceptionObject != NULL)
6131 PyCodec_StrictErrors(*exceptionObject);
6132}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133
6134/* error handling callback helper:
6135 build arguments, call the callback and check the arguments,
6136 put the result into newpos and return the replacement string, which
6137 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006138static PyObject *
6139unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006140 PyObject **errorHandler,
6141 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006142 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006143 Py_ssize_t startpos, Py_ssize_t endpos,
6144 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006146 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006147 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 PyObject *restuple;
6149 PyObject *resunicode;
6150
6151 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006155 }
6156
Benjamin Petersonbac79492012-01-14 13:34:47 -05006157 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006158 return NULL;
6159 len = PyUnicode_GET_LENGTH(unicode);
6160
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006161 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006162 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165
6166 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006168 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006171 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 Py_DECREF(restuple);
6173 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006175 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 &resunicode, newpos)) {
6177 Py_DECREF(restuple);
6178 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006180 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6181 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6182 Py_DECREF(restuple);
6183 return NULL;
6184 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006186 *newpos = len + *newpos;
6187 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6189 Py_DECREF(restuple);
6190 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006191 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 Py_INCREF(resunicode);
6193 Py_DECREF(restuple);
6194 return resunicode;
6195}
6196
Alexander Belopolsky40018472011-02-26 01:02:56 +00006197static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006198unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006199 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006200 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006202 /* input state */
6203 Py_ssize_t pos=0, size;
6204 int kind;
6205 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006206 /* output object */
6207 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 /* pointer into the output */
6209 char *str;
6210 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006211 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006212 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6213 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 PyObject *errorHandler = NULL;
6215 PyObject *exc = NULL;
6216 /* the following variable is used for caching string comparisons
6217 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6218 int known_errorHandler = -1;
6219
Benjamin Petersonbac79492012-01-14 13:34:47 -05006220 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006221 return NULL;
6222 size = PyUnicode_GET_LENGTH(unicode);
6223 kind = PyUnicode_KIND(unicode);
6224 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006225 /* allocate enough for a simple encoding without
6226 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006227 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006228 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006229 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006230 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006231 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006232 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006233 ressize = size;
6234
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006235 while (pos < size) {
6236 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006237
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 /* can we encode this? */
6239 if (c<limit) {
6240 /* no overflow check, because we know that the space is enough */
6241 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006242 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006243 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 Py_ssize_t requiredsize;
6246 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006247 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006249 Py_ssize_t collstart = pos;
6250 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006252 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 ++collend;
6254 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6255 if (known_errorHandler==-1) {
6256 if ((errors==NULL) || (!strcmp(errors, "strict")))
6257 known_errorHandler = 1;
6258 else if (!strcmp(errors, "replace"))
6259 known_errorHandler = 2;
6260 else if (!strcmp(errors, "ignore"))
6261 known_errorHandler = 3;
6262 else if (!strcmp(errors, "xmlcharrefreplace"))
6263 known_errorHandler = 4;
6264 else
6265 known_errorHandler = 0;
6266 }
6267 switch (known_errorHandler) {
6268 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006269 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 goto onError;
6271 case 2: /* replace */
6272 while (collstart++<collend)
6273 *str++ = '?'; /* fall through */
6274 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 break;
6277 case 4: /* xmlcharrefreplace */
6278 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006279 /* determine replacement size */
6280 for (i = collstart, repsize = 0; i < collend; ++i) {
6281 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6282 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006284 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006286 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006288 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006290 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006292 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006294 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006295 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006297 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006299 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 if (requiredsize > ressize) {
6301 if (requiredsize<2*ressize)
6302 requiredsize = 2*ressize;
6303 if (_PyBytes_Resize(&res, requiredsize))
6304 goto onError;
6305 str = PyBytes_AS_STRING(res) + respos;
6306 ressize = requiredsize;
6307 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006308 /* generate replacement */
6309 for (i = collstart; i < collend; ++i) {
6310 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006312 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 break;
6314 default:
6315 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006316 encoding, reason, unicode, &exc,
6317 collstart, collend, &newpos);
6318 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006319 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006321 if (PyBytes_Check(repunicode)) {
6322 /* Directly copy bytes result to output. */
6323 repsize = PyBytes_Size(repunicode);
6324 if (repsize > 1) {
6325 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006326 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006327 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6328 Py_DECREF(repunicode);
6329 goto onError;
6330 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006331 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006332 ressize += repsize-1;
6333 }
6334 memcpy(str, PyBytes_AsString(repunicode), repsize);
6335 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006336 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006337 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006338 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006339 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 /* need more space? (at least enough for what we
6341 have+the replacement+the rest of the string, so
6342 we won't have to check space for encodable characters) */
6343 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006344 repsize = PyUnicode_GET_LENGTH(repunicode);
6345 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 if (requiredsize > ressize) {
6347 if (requiredsize<2*ressize)
6348 requiredsize = 2*ressize;
6349 if (_PyBytes_Resize(&res, requiredsize)) {
6350 Py_DECREF(repunicode);
6351 goto onError;
6352 }
6353 str = PyBytes_AS_STRING(res) + respos;
6354 ressize = requiredsize;
6355 }
6356 /* check if there is anything unencodable in the replacement
6357 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358 for (i = 0; repsize-->0; ++i, ++str) {
6359 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006361 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006362 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 Py_DECREF(repunicode);
6364 goto onError;
6365 }
6366 *str = (char)c;
6367 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006369 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006370 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006371 }
6372 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006373 /* Resize if we allocated too much */
6374 size = str - PyBytes_AS_STRING(res);
6375 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006376 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006377 if (_PyBytes_Resize(&res, size) < 0)
6378 goto onError;
6379 }
6380
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381 Py_XDECREF(errorHandler);
6382 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006383 return res;
6384
6385 onError:
6386 Py_XDECREF(res);
6387 Py_XDECREF(errorHandler);
6388 Py_XDECREF(exc);
6389 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390}
6391
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006393PyObject *
6394PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006395 Py_ssize_t size,
6396 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 PyObject *result;
6399 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6400 if (unicode == NULL)
6401 return NULL;
6402 result = unicode_encode_ucs1(unicode, errors, 256);
6403 Py_DECREF(unicode);
6404 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
Alexander Belopolsky40018472011-02-26 01:02:56 +00006407PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006408_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409{
6410 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 PyErr_BadArgument();
6412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006414 if (PyUnicode_READY(unicode) == -1)
6415 return NULL;
6416 /* Fast path: if it is a one-byte string, construct
6417 bytes object directly. */
6418 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6419 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6420 PyUnicode_GET_LENGTH(unicode));
6421 /* Non-Latin-1 characters present. Defer to above function to
6422 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006423 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006424}
6425
6426PyObject*
6427PyUnicode_AsLatin1String(PyObject *unicode)
6428{
6429 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430}
6431
6432/* --- 7-bit ASCII Codec -------------------------------------------------- */
6433
Alexander Belopolsky40018472011-02-26 01:02:56 +00006434PyObject *
6435PyUnicode_DecodeASCII(const char *s,
6436 Py_ssize_t size,
6437 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006440 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006441 int kind;
6442 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006443 Py_ssize_t startinpos;
6444 Py_ssize_t endinpos;
6445 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 const char *e;
6447 PyObject *errorHandler = NULL;
6448 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006449
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006451 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006454 if (size == 1 && (unsigned char)s[0] < 128)
6455 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006456
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006457 _PyUnicodeWriter_Init(&writer, 0);
6458 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006462 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006463 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006464 writer.pos = outpos;
6465 if (writer.pos == size)
6466 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006467
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006468 s += writer.pos;
6469 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 register unsigned char c = (unsigned char)*s;
6472 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006473 PyUnicode_WRITE(kind, data, writer.pos, c);
6474 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 ++s;
6476 }
6477 else {
6478 startinpos = s-starts;
6479 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006480 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 errors, &errorHandler,
6482 "ascii", "ordinal not in range(128)",
6483 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006484 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006486 kind = writer.kind;
6487 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490 Py_XDECREF(errorHandler);
6491 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006492 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006493
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006495 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496 Py_XDECREF(errorHandler);
6497 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 return NULL;
6499}
6500
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502PyObject *
6503PyUnicode_EncodeASCII(const Py_UNICODE *p,
6504 Py_ssize_t size,
6505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 PyObject *result;
6508 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6509 if (unicode == NULL)
6510 return NULL;
6511 result = unicode_encode_ucs1(unicode, errors, 128);
6512 Py_DECREF(unicode);
6513 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514}
6515
Alexander Belopolsky40018472011-02-26 01:02:56 +00006516PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006517_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518{
6519 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 PyErr_BadArgument();
6521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006523 if (PyUnicode_READY(unicode) == -1)
6524 return NULL;
6525 /* Fast path: if it is an ASCII-only string, construct bytes object
6526 directly. Else defer to above function to raise the exception. */
6527 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6528 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6529 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006531}
6532
6533PyObject *
6534PyUnicode_AsASCIIString(PyObject *unicode)
6535{
6536 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537}
6538
Victor Stinner99b95382011-07-04 14:23:54 +02006539#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006540
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006541/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006542
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006543#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006544#define NEED_RETRY
6545#endif
6546
Victor Stinner3a50e702011-10-18 21:21:00 +02006547#ifndef WC_ERR_INVALID_CHARS
6548# define WC_ERR_INVALID_CHARS 0x0080
6549#endif
6550
6551static char*
6552code_page_name(UINT code_page, PyObject **obj)
6553{
6554 *obj = NULL;
6555 if (code_page == CP_ACP)
6556 return "mbcs";
6557 if (code_page == CP_UTF7)
6558 return "CP_UTF7";
6559 if (code_page == CP_UTF8)
6560 return "CP_UTF8";
6561
6562 *obj = PyBytes_FromFormat("cp%u", code_page);
6563 if (*obj == NULL)
6564 return NULL;
6565 return PyBytes_AS_STRING(*obj);
6566}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006567
Alexander Belopolsky40018472011-02-26 01:02:56 +00006568static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006569is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006570{
6571 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006572 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573
Victor Stinner3a50e702011-10-18 21:21:00 +02006574 if (!IsDBCSLeadByteEx(code_page, *curr))
6575 return 0;
6576
6577 prev = CharPrevExA(code_page, s, curr, 0);
6578 if (prev == curr)
6579 return 1;
6580 /* FIXME: This code is limited to "true" double-byte encodings,
6581 as it assumes an incomplete character consists of a single
6582 byte. */
6583 if (curr - prev == 2)
6584 return 1;
6585 if (!IsDBCSLeadByteEx(code_page, *prev))
6586 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006587 return 0;
6588}
6589
Victor Stinner3a50e702011-10-18 21:21:00 +02006590static DWORD
6591decode_code_page_flags(UINT code_page)
6592{
6593 if (code_page == CP_UTF7) {
6594 /* The CP_UTF7 decoder only supports flags=0 */
6595 return 0;
6596 }
6597 else
6598 return MB_ERR_INVALID_CHARS;
6599}
6600
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006601/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006602 * Decode a byte string from a Windows code page into unicode object in strict
6603 * mode.
6604 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006605 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6606 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006607 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006608static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006609decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006610 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006611 const char *in,
6612 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006613{
Victor Stinner3a50e702011-10-18 21:21:00 +02006614 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006615 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006616 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617
6618 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006619 assert(insize > 0);
6620 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6621 if (outsize <= 0)
6622 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006623
6624 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006626 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006627 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 if (*v == NULL)
6629 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006630 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006631 }
6632 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006634 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006635 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006637 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006638 }
6639
6640 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006641 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6642 if (outsize <= 0)
6643 goto error;
6644 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006645
Victor Stinner3a50e702011-10-18 21:21:00 +02006646error:
6647 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6648 return -2;
6649 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006650 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006651}
6652
Victor Stinner3a50e702011-10-18 21:21:00 +02006653/*
6654 * Decode a byte string from a code page into unicode object with an error
6655 * handler.
6656 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006657 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006658 * UnicodeDecodeError exception and returns -1 on error.
6659 */
6660static int
6661decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006662 PyObject **v,
6663 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006664 const char *errors)
6665{
6666 const char *startin = in;
6667 const char *endin = in + size;
6668 const DWORD flags = decode_code_page_flags(code_page);
6669 /* Ideally, we should get reason from FormatMessage. This is the Windows
6670 2000 English version of the message. */
6671 const char *reason = "No mapping for the Unicode character exists "
6672 "in the target code page.";
6673 /* each step cannot decode more than 1 character, but a character can be
6674 represented as a surrogate pair */
6675 wchar_t buffer[2], *startout, *out;
6676 int insize, outsize;
6677 PyObject *errorHandler = NULL;
6678 PyObject *exc = NULL;
6679 PyObject *encoding_obj = NULL;
6680 char *encoding;
6681 DWORD err;
6682 int ret = -1;
6683
6684 assert(size > 0);
6685
6686 encoding = code_page_name(code_page, &encoding_obj);
6687 if (encoding == NULL)
6688 return -1;
6689
6690 if (errors == NULL || strcmp(errors, "strict") == 0) {
6691 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6692 UnicodeDecodeError. */
6693 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6694 if (exc != NULL) {
6695 PyCodec_StrictErrors(exc);
6696 Py_CLEAR(exc);
6697 }
6698 goto error;
6699 }
6700
6701 if (*v == NULL) {
6702 /* Create unicode object */
6703 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6704 PyErr_NoMemory();
6705 goto error;
6706 }
Victor Stinnerab595942011-12-17 04:59:06 +01006707 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006708 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006709 if (*v == NULL)
6710 goto error;
6711 startout = PyUnicode_AS_UNICODE(*v);
6712 }
6713 else {
6714 /* Extend unicode object */
6715 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6716 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6717 PyErr_NoMemory();
6718 goto error;
6719 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006720 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006721 goto error;
6722 startout = PyUnicode_AS_UNICODE(*v) + n;
6723 }
6724
6725 /* Decode the byte string character per character */
6726 out = startout;
6727 while (in < endin)
6728 {
6729 /* Decode a character */
6730 insize = 1;
6731 do
6732 {
6733 outsize = MultiByteToWideChar(code_page, flags,
6734 in, insize,
6735 buffer, Py_ARRAY_LENGTH(buffer));
6736 if (outsize > 0)
6737 break;
6738 err = GetLastError();
6739 if (err != ERROR_NO_UNICODE_TRANSLATION
6740 && err != ERROR_INSUFFICIENT_BUFFER)
6741 {
6742 PyErr_SetFromWindowsErr(0);
6743 goto error;
6744 }
6745 insize++;
6746 }
6747 /* 4=maximum length of a UTF-8 sequence */
6748 while (insize <= 4 && (in + insize) <= endin);
6749
6750 if (outsize <= 0) {
6751 Py_ssize_t startinpos, endinpos, outpos;
6752
6753 startinpos = in - startin;
6754 endinpos = startinpos + 1;
6755 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006756 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006757 errors, &errorHandler,
6758 encoding, reason,
6759 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006760 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006761 {
6762 goto error;
6763 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006764 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006765 }
6766 else {
6767 in += insize;
6768 memcpy(out, buffer, outsize * sizeof(wchar_t));
6769 out += outsize;
6770 }
6771 }
6772
6773 /* write a NUL character at the end */
6774 *out = 0;
6775
6776 /* Extend unicode object */
6777 outsize = out - startout;
6778 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006779 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006780 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006781 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006782
6783error:
6784 Py_XDECREF(encoding_obj);
6785 Py_XDECREF(errorHandler);
6786 Py_XDECREF(exc);
6787 return ret;
6788}
6789
Victor Stinner3a50e702011-10-18 21:21:00 +02006790static PyObject *
6791decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006792 const char *s, Py_ssize_t size,
6793 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006794{
Victor Stinner76a31a62011-11-04 00:05:13 +01006795 PyObject *v = NULL;
6796 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006797
Victor Stinner3a50e702011-10-18 21:21:00 +02006798 if (code_page < 0) {
6799 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6800 return NULL;
6801 }
6802
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006805
Victor Stinner76a31a62011-11-04 00:05:13 +01006806 do
6807 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006808#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006809 if (size > INT_MAX) {
6810 chunk_size = INT_MAX;
6811 final = 0;
6812 done = 0;
6813 }
6814 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006815#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006816 {
6817 chunk_size = (int)size;
6818 final = (consumed == NULL);
6819 done = 1;
6820 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006821
Victor Stinner76a31a62011-11-04 00:05:13 +01006822 /* Skip trailing lead-byte unless 'final' is set */
6823 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6824 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825
Victor Stinner76a31a62011-11-04 00:05:13 +01006826 if (chunk_size == 0 && done) {
6827 if (v != NULL)
6828 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006829 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006830 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831
Victor Stinner76a31a62011-11-04 00:05:13 +01006832
6833 converted = decode_code_page_strict(code_page, &v,
6834 s, chunk_size);
6835 if (converted == -2)
6836 converted = decode_code_page_errors(code_page, &v,
6837 s, chunk_size,
6838 errors);
6839 assert(converted != 0);
6840
6841 if (converted < 0) {
6842 Py_XDECREF(v);
6843 return NULL;
6844 }
6845
6846 if (consumed)
6847 *consumed += converted;
6848
6849 s += converted;
6850 size -= converted;
6851 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006852
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006853 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006854}
6855
Alexander Belopolsky40018472011-02-26 01:02:56 +00006856PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006857PyUnicode_DecodeCodePageStateful(int code_page,
6858 const char *s,
6859 Py_ssize_t size,
6860 const char *errors,
6861 Py_ssize_t *consumed)
6862{
6863 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6864}
6865
6866PyObject *
6867PyUnicode_DecodeMBCSStateful(const char *s,
6868 Py_ssize_t size,
6869 const char *errors,
6870 Py_ssize_t *consumed)
6871{
6872 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6873}
6874
6875PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006876PyUnicode_DecodeMBCS(const char *s,
6877 Py_ssize_t size,
6878 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006879{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6881}
6882
Victor Stinner3a50e702011-10-18 21:21:00 +02006883static DWORD
6884encode_code_page_flags(UINT code_page, const char *errors)
6885{
6886 if (code_page == CP_UTF8) {
6887 if (winver.dwMajorVersion >= 6)
6888 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6889 and later */
6890 return WC_ERR_INVALID_CHARS;
6891 else
6892 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6893 return 0;
6894 }
6895 else if (code_page == CP_UTF7) {
6896 /* CP_UTF7 only supports flags=0 */
6897 return 0;
6898 }
6899 else {
6900 if (errors != NULL && strcmp(errors, "replace") == 0)
6901 return 0;
6902 else
6903 return WC_NO_BEST_FIT_CHARS;
6904 }
6905}
6906
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 * Encode a Unicode string to a Windows code page into a byte string in strict
6909 * mode.
6910 *
6911 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006912 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006913 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006914static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006915encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006916 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918{
Victor Stinner554f3f02010-06-16 23:33:54 +00006919 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006920 BOOL *pusedDefaultChar = &usedDefaultChar;
6921 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006922 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006923 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006924 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006925 const DWORD flags = encode_code_page_flags(code_page, NULL);
6926 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006927 /* Create a substring so that we can get the UTF-16 representation
6928 of just the slice under consideration. */
6929 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006930
Martin v. Löwis3d325192011-11-04 18:23:06 +01006931 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006932
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006934 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006935 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006936 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006937
Victor Stinner2fc507f2011-11-04 20:06:39 +01006938 substring = PyUnicode_Substring(unicode, offset, offset+len);
6939 if (substring == NULL)
6940 return -1;
6941 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6942 if (p == NULL) {
6943 Py_DECREF(substring);
6944 return -1;
6945 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006946
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006947 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 outsize = WideCharToMultiByte(code_page, flags,
6949 p, size,
6950 NULL, 0,
6951 NULL, pusedDefaultChar);
6952 if (outsize <= 0)
6953 goto error;
6954 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006955 if (pusedDefaultChar && *pusedDefaultChar) {
6956 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006958 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006959
Victor Stinner3a50e702011-10-18 21:21:00 +02006960 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006962 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006963 if (*outbytes == NULL) {
6964 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006966 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006967 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968 }
6969 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 const Py_ssize_t n = PyBytes_Size(*outbytes);
6972 if (outsize > PY_SSIZE_T_MAX - n) {
6973 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006974 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006976 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006977 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6978 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006980 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982 }
6983
6984 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006985 outsize = WideCharToMultiByte(code_page, flags,
6986 p, size,
6987 out, outsize,
6988 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006989 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006990 if (outsize <= 0)
6991 goto error;
6992 if (pusedDefaultChar && *pusedDefaultChar)
6993 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006995
Victor Stinner3a50e702011-10-18 21:21:00 +02006996error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006997 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006998 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6999 return -2;
7000 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007001 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007002}
7003
Victor Stinner3a50e702011-10-18 21:21:00 +02007004/*
7005 * Encode a Unicode string to a Windows code page into a byte string using a
7006 * error handler.
7007 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007008 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 * -1 on other error.
7010 */
7011static int
7012encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007013 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007014 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007015{
Victor Stinner3a50e702011-10-18 21:21:00 +02007016 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007017 Py_ssize_t pos = unicode_offset;
7018 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007019 /* Ideally, we should get reason from FormatMessage. This is the Windows
7020 2000 English version of the message. */
7021 const char *reason = "invalid character";
7022 /* 4=maximum length of a UTF-8 sequence */
7023 char buffer[4];
7024 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7025 Py_ssize_t outsize;
7026 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007027 PyObject *errorHandler = NULL;
7028 PyObject *exc = NULL;
7029 PyObject *encoding_obj = NULL;
7030 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007031 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007032 PyObject *rep;
7033 int ret = -1;
7034
7035 assert(insize > 0);
7036
7037 encoding = code_page_name(code_page, &encoding_obj);
7038 if (encoding == NULL)
7039 return -1;
7040
7041 if (errors == NULL || strcmp(errors, "strict") == 0) {
7042 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7043 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007044 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007045 if (exc != NULL) {
7046 PyCodec_StrictErrors(exc);
7047 Py_DECREF(exc);
7048 }
7049 Py_XDECREF(encoding_obj);
7050 return -1;
7051 }
7052
7053 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7054 pusedDefaultChar = &usedDefaultChar;
7055 else
7056 pusedDefaultChar = NULL;
7057
7058 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7059 PyErr_NoMemory();
7060 goto error;
7061 }
7062 outsize = insize * Py_ARRAY_LENGTH(buffer);
7063
7064 if (*outbytes == NULL) {
7065 /* Create string object */
7066 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7067 if (*outbytes == NULL)
7068 goto error;
7069 out = PyBytes_AS_STRING(*outbytes);
7070 }
7071 else {
7072 /* Extend string object */
7073 Py_ssize_t n = PyBytes_Size(*outbytes);
7074 if (n > PY_SSIZE_T_MAX - outsize) {
7075 PyErr_NoMemory();
7076 goto error;
7077 }
7078 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7079 goto error;
7080 out = PyBytes_AS_STRING(*outbytes) + n;
7081 }
7082
7083 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007084 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007085 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007086 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7087 wchar_t chars[2];
7088 int charsize;
7089 if (ch < 0x10000) {
7090 chars[0] = (wchar_t)ch;
7091 charsize = 1;
7092 }
7093 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007094 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7095 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007096 charsize = 2;
7097 }
7098
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007100 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 buffer, Py_ARRAY_LENGTH(buffer),
7102 NULL, pusedDefaultChar);
7103 if (outsize > 0) {
7104 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7105 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007106 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 memcpy(out, buffer, outsize);
7108 out += outsize;
7109 continue;
7110 }
7111 }
7112 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7113 PyErr_SetFromWindowsErr(0);
7114 goto error;
7115 }
7116
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 rep = unicode_encode_call_errorhandler(
7118 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007119 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007120 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 if (rep == NULL)
7122 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007123 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007124
7125 if (PyBytes_Check(rep)) {
7126 outsize = PyBytes_GET_SIZE(rep);
7127 if (outsize != 1) {
7128 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7129 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7130 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7131 Py_DECREF(rep);
7132 goto error;
7133 }
7134 out = PyBytes_AS_STRING(*outbytes) + offset;
7135 }
7136 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7137 out += outsize;
7138 }
7139 else {
7140 Py_ssize_t i;
7141 enum PyUnicode_Kind kind;
7142 void *data;
7143
Benjamin Petersonbac79492012-01-14 13:34:47 -05007144 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 Py_DECREF(rep);
7146 goto error;
7147 }
7148
7149 outsize = PyUnicode_GET_LENGTH(rep);
7150 if (outsize != 1) {
7151 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7152 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7153 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7154 Py_DECREF(rep);
7155 goto error;
7156 }
7157 out = PyBytes_AS_STRING(*outbytes) + offset;
7158 }
7159 kind = PyUnicode_KIND(rep);
7160 data = PyUnicode_DATA(rep);
7161 for (i=0; i < outsize; i++) {
7162 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7163 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007164 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007165 encoding, unicode,
7166 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 "unable to encode error handler result to ASCII");
7168 Py_DECREF(rep);
7169 goto error;
7170 }
7171 *out = (unsigned char)ch;
7172 out++;
7173 }
7174 }
7175 Py_DECREF(rep);
7176 }
7177 /* write a NUL byte */
7178 *out = 0;
7179 outsize = out - PyBytes_AS_STRING(*outbytes);
7180 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7181 if (_PyBytes_Resize(outbytes, outsize) < 0)
7182 goto error;
7183 ret = 0;
7184
7185error:
7186 Py_XDECREF(encoding_obj);
7187 Py_XDECREF(errorHandler);
7188 Py_XDECREF(exc);
7189 return ret;
7190}
7191
Victor Stinner3a50e702011-10-18 21:21:00 +02007192static PyObject *
7193encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007194 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 const char *errors)
7196{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007197 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007199 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007200 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007201
Benjamin Petersonbac79492012-01-14 13:34:47 -05007202 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007203 return NULL;
7204 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007205
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 if (code_page < 0) {
7207 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7208 return NULL;
7209 }
7210
Martin v. Löwis3d325192011-11-04 18:23:06 +01007211 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007212 return PyBytes_FromStringAndSize(NULL, 0);
7213
Victor Stinner7581cef2011-11-03 22:32:33 +01007214 offset = 0;
7215 do
7216 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007218 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007219 chunks. */
7220 if (len > INT_MAX/2) {
7221 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007222 done = 0;
7223 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007224 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007225#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007226 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007227 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007228 done = 1;
7229 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230
Victor Stinner76a31a62011-11-04 00:05:13 +01007231 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007232 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007233 errors);
7234 if (ret == -2)
7235 ret = encode_code_page_errors(code_page, &outbytes,
7236 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007237 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007238 if (ret < 0) {
7239 Py_XDECREF(outbytes);
7240 return NULL;
7241 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007242
Victor Stinner7581cef2011-11-03 22:32:33 +01007243 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007244 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007245 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007246
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 return outbytes;
7248}
7249
7250PyObject *
7251PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7252 Py_ssize_t size,
7253 const char *errors)
7254{
Victor Stinner7581cef2011-11-03 22:32:33 +01007255 PyObject *unicode, *res;
7256 unicode = PyUnicode_FromUnicode(p, size);
7257 if (unicode == NULL)
7258 return NULL;
7259 res = encode_code_page(CP_ACP, unicode, errors);
7260 Py_DECREF(unicode);
7261 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007262}
7263
7264PyObject *
7265PyUnicode_EncodeCodePage(int code_page,
7266 PyObject *unicode,
7267 const char *errors)
7268{
Victor Stinner7581cef2011-11-03 22:32:33 +01007269 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007270}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007271
Alexander Belopolsky40018472011-02-26 01:02:56 +00007272PyObject *
7273PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007274{
7275 if (!PyUnicode_Check(unicode)) {
7276 PyErr_BadArgument();
7277 return NULL;
7278 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007279 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007280}
7281
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007282#undef NEED_RETRY
7283
Victor Stinner99b95382011-07-04 14:23:54 +02007284#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007285
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286/* --- Character Mapping Codec -------------------------------------------- */
7287
Alexander Belopolsky40018472011-02-26 01:02:56 +00007288PyObject *
7289PyUnicode_DecodeCharmap(const char *s,
7290 Py_ssize_t size,
7291 PyObject *mapping,
7292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007294 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007295 Py_ssize_t startinpos;
7296 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007297 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007298 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299 PyObject *errorHandler = NULL;
7300 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007301
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 /* Default to Latin-1 */
7303 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007307 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007308 _PyUnicodeWriter_Init(&writer, 0);
7309 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007313 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007314 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007315 enum PyUnicode_Kind mapkind;
7316 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007317 Py_UCS4 x;
Victor Stinner03c3e352013-04-09 21:53:09 +02007318 unsigned char ch;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007319
Benjamin Petersonbac79492012-01-14 13:34:47 -05007320 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007321 return NULL;
7322
7323 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007324 mapdata = PyUnicode_DATA(mapping);
7325 mapkind = PyUnicode_KIND(mapping);
Victor Stinner03c3e352013-04-09 21:53:09 +02007326
7327 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7328 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7329 * is disabled in encoding aliases, latin1 is preferred because
7330 * its implementation is faster. */
7331 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7332 Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
7333 Py_UCS4 maxchar = writer.maxchar;
7334
7335 assert (writer.kind == PyUnicode_1BYTE_KIND);
7336 while (s < e) {
7337 ch = *s;
7338 x = mapdata_ucs1[ch];
7339 if (x > maxchar) {
7340 if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
7341 goto onError;
7342 maxchar = writer.maxchar;
7343 outdata = (Py_UCS1 *)writer.data;
7344 }
7345 outdata[writer.pos] = x;
7346 writer.pos++;
7347 ++s;
7348 }
7349 }
7350
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007352 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007353 enum PyUnicode_Kind outkind = writer.kind;
7354 void *outdata = writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007355 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007356 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007357 while (s < e) {
7358 unsigned char ch = *s;
7359 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7360 if (x > maxchar)
7361 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007362 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x);
7363 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007364 ++s;
7365 }
7366 break;
7367 }
7368 else if (outkind == PyUnicode_2BYTE_KIND) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007369 while (s < e) {
7370 unsigned char ch = *s;
7371 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7372 if (x == 0xFFFE)
7373 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007374 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x);
7375 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007376 ++s;
7377 }
7378 break;
7379 }
7380 }
7381 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007384 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007385 else
7386 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007387Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007388 if (x == 0xfffe)
7389 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 startinpos = s-starts;
7392 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007393 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 errors, &errorHandler,
7395 "charmap", "character maps to <undefined>",
7396 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007397 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 goto onError;
7399 }
7400 continue;
7401 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007402
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007403 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007404 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007405 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
7406 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007408 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007409 }
7410 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 while (s < e) {
7412 unsigned char ch = *s;
7413 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007414
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7416 w = PyLong_FromLong((long)ch);
7417 if (w == NULL)
7418 goto onError;
7419 x = PyObject_GetItem(mapping, w);
7420 Py_DECREF(w);
7421 if (x == NULL) {
7422 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7423 /* No mapping found means: mapping is undefined. */
7424 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007425 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 } else
7427 goto onError;
7428 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007429
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007431 if (x == Py_None)
7432 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 if (PyLong_Check(x)) {
7434 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007435 if (value == 0xFFFE)
7436 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007437 if (value < 0 || value > MAX_UNICODE) {
7438 PyErr_Format(PyExc_TypeError,
7439 "character mapping must be in range(0x%lx)",
7440 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 Py_DECREF(x);
7442 goto onError;
7443 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007444
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007445 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007446 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007447 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007448 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007449 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7450 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 else if (PyUnicode_Check(x)) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007453 if (PyUnicode_READY(x) == -1) {
7454 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007455 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007456 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007457 if (PyUnicode_GET_LENGTH(x) == 1) {
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007458 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007459 if (value == 0xFFFE)
7460 goto Undefined;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007461 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1) {
7462 Py_DECREF(x);
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007463 goto onError;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007464 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007465 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7466 writer.pos++;
7467 }
7468 else {
7469 writer.overallocate = 1;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007470 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007471 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007472 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007473 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007474 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 }
7476 else {
7477 /* wrong return value */
7478 PyErr_SetString(PyExc_TypeError,
7479 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007480 Py_DECREF(x);
7481 goto onError;
7482 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 Py_DECREF(x);
7484 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007485 continue;
7486Undefined:
7487 /* undefined mapping */
7488 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007489 startinpos = s-starts;
7490 endinpos = startinpos+1;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007491 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007492 errors, &errorHandler,
7493 "charmap", "character maps to <undefined>",
7494 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007495 &writer)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007496 goto onError;
7497 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007500 Py_XDECREF(errorHandler);
7501 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007502 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007503
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007505 Py_XDECREF(errorHandler);
7506 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007507 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 return NULL;
7509}
7510
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007511/* Charmap encoding: the lookup table */
7512
Alexander Belopolsky40018472011-02-26 01:02:56 +00007513struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 PyObject_HEAD
7515 unsigned char level1[32];
7516 int count2, count3;
7517 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007518};
7519
7520static PyObject*
7521encoding_map_size(PyObject *obj, PyObject* args)
7522{
7523 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007524 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007526}
7527
7528static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007529 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 PyDoc_STR("Return the size (in bytes) of this object") },
7531 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007532};
7533
7534static void
7535encoding_map_dealloc(PyObject* o)
7536{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007537 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007538}
7539
7540static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007541 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 "EncodingMap", /*tp_name*/
7543 sizeof(struct encoding_map), /*tp_basicsize*/
7544 0, /*tp_itemsize*/
7545 /* methods */
7546 encoding_map_dealloc, /*tp_dealloc*/
7547 0, /*tp_print*/
7548 0, /*tp_getattr*/
7549 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007550 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 0, /*tp_repr*/
7552 0, /*tp_as_number*/
7553 0, /*tp_as_sequence*/
7554 0, /*tp_as_mapping*/
7555 0, /*tp_hash*/
7556 0, /*tp_call*/
7557 0, /*tp_str*/
7558 0, /*tp_getattro*/
7559 0, /*tp_setattro*/
7560 0, /*tp_as_buffer*/
7561 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7562 0, /*tp_doc*/
7563 0, /*tp_traverse*/
7564 0, /*tp_clear*/
7565 0, /*tp_richcompare*/
7566 0, /*tp_weaklistoffset*/
7567 0, /*tp_iter*/
7568 0, /*tp_iternext*/
7569 encoding_map_methods, /*tp_methods*/
7570 0, /*tp_members*/
7571 0, /*tp_getset*/
7572 0, /*tp_base*/
7573 0, /*tp_dict*/
7574 0, /*tp_descr_get*/
7575 0, /*tp_descr_set*/
7576 0, /*tp_dictoffset*/
7577 0, /*tp_init*/
7578 0, /*tp_alloc*/
7579 0, /*tp_new*/
7580 0, /*tp_free*/
7581 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007582};
7583
7584PyObject*
7585PyUnicode_BuildEncodingMap(PyObject* string)
7586{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007587 PyObject *result;
7588 struct encoding_map *mresult;
7589 int i;
7590 int need_dict = 0;
7591 unsigned char level1[32];
7592 unsigned char level2[512];
7593 unsigned char *mlevel1, *mlevel2, *mlevel3;
7594 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007595 int kind;
7596 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007597 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007598 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007599
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007600 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007601 PyErr_BadArgument();
7602 return NULL;
7603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007604 kind = PyUnicode_KIND(string);
7605 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007606 length = PyUnicode_GET_LENGTH(string);
7607 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007608 memset(level1, 0xFF, sizeof level1);
7609 memset(level2, 0xFF, sizeof level2);
7610
7611 /* If there isn't a one-to-one mapping of NULL to \0,
7612 or if there are non-BMP characters, we need to use
7613 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007614 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007615 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007616 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007617 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007618 ch = PyUnicode_READ(kind, data, i);
7619 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007620 need_dict = 1;
7621 break;
7622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007623 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007624 /* unmapped character */
7625 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007626 l1 = ch >> 11;
7627 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007628 if (level1[l1] == 0xFF)
7629 level1[l1] = count2++;
7630 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007631 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007632 }
7633
7634 if (count2 >= 0xFF || count3 >= 0xFF)
7635 need_dict = 1;
7636
7637 if (need_dict) {
7638 PyObject *result = PyDict_New();
7639 PyObject *key, *value;
7640 if (!result)
7641 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007642 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007643 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007644 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007645 if (!key || !value)
7646 goto failed1;
7647 if (PyDict_SetItem(result, key, value) == -1)
7648 goto failed1;
7649 Py_DECREF(key);
7650 Py_DECREF(value);
7651 }
7652 return result;
7653 failed1:
7654 Py_XDECREF(key);
7655 Py_XDECREF(value);
7656 Py_DECREF(result);
7657 return NULL;
7658 }
7659
7660 /* Create a three-level trie */
7661 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7662 16*count2 + 128*count3 - 1);
7663 if (!result)
7664 return PyErr_NoMemory();
7665 PyObject_Init(result, &EncodingMapType);
7666 mresult = (struct encoding_map*)result;
7667 mresult->count2 = count2;
7668 mresult->count3 = count3;
7669 mlevel1 = mresult->level1;
7670 mlevel2 = mresult->level23;
7671 mlevel3 = mresult->level23 + 16*count2;
7672 memcpy(mlevel1, level1, 32);
7673 memset(mlevel2, 0xFF, 16*count2);
7674 memset(mlevel3, 0, 128*count3);
7675 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007676 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007677 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007678 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7679 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007680 /* unmapped character */
7681 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007682 o1 = ch>>11;
7683 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007684 i2 = 16*mlevel1[o1] + o2;
7685 if (mlevel2[i2] == 0xFF)
7686 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007687 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007688 i3 = 128*mlevel2[i2] + o3;
7689 mlevel3[i3] = i;
7690 }
7691 return result;
7692}
7693
7694static int
Victor Stinner22168992011-11-20 17:09:18 +01007695encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007696{
7697 struct encoding_map *map = (struct encoding_map*)mapping;
7698 int l1 = c>>11;
7699 int l2 = (c>>7) & 0xF;
7700 int l3 = c & 0x7F;
7701 int i;
7702
Victor Stinner22168992011-11-20 17:09:18 +01007703 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007705 if (c == 0)
7706 return 0;
7707 /* level 1*/
7708 i = map->level1[l1];
7709 if (i == 0xFF) {
7710 return -1;
7711 }
7712 /* level 2*/
7713 i = map->level23[16*i+l2];
7714 if (i == 0xFF) {
7715 return -1;
7716 }
7717 /* level 3 */
7718 i = map->level23[16*map->count2 + 128*i + l3];
7719 if (i == 0) {
7720 return -1;
7721 }
7722 return i;
7723}
7724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725/* Lookup the character ch in the mapping. If the character
7726 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007727 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007728static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007729charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730{
Christian Heimes217cfd12007-12-02 14:31:20 +00007731 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007732 PyObject *x;
7733
7734 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007736 x = PyObject_GetItem(mapping, w);
7737 Py_DECREF(w);
7738 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7740 /* No mapping found means: mapping is undefined. */
7741 PyErr_Clear();
7742 x = Py_None;
7743 Py_INCREF(x);
7744 return x;
7745 } else
7746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007748 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007750 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 long value = PyLong_AS_LONG(x);
7752 if (value < 0 || value > 255) {
7753 PyErr_SetString(PyExc_TypeError,
7754 "character mapping must be in range(256)");
7755 Py_DECREF(x);
7756 return NULL;
7757 }
7758 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007760 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 /* wrong return value */
7764 PyErr_Format(PyExc_TypeError,
7765 "character mapping must return integer, bytes or None, not %.400s",
7766 x->ob_type->tp_name);
7767 Py_DECREF(x);
7768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 }
7770}
7771
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007772static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007773charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007775 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7776 /* exponentially overallocate to minimize reallocations */
7777 if (requiredsize < 2*outsize)
7778 requiredsize = 2*outsize;
7779 if (_PyBytes_Resize(outobj, requiredsize))
7780 return -1;
7781 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007782}
7783
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007787/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007788 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789 space is available. Return a new reference to the object that
7790 was put in the output buffer, or Py_None, if the mapping was undefined
7791 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007792 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007793static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007794charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007795 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007796{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007797 PyObject *rep;
7798 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007799 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007800
Christian Heimes90aa7642007-12-19 02:45:37 +00007801 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007802 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007804 if (res == -1)
7805 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 if (outsize<requiredsize)
7807 if (charmapencode_resize(outobj, outpos, requiredsize))
7808 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007809 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 outstart[(*outpos)++] = (char)res;
7811 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812 }
7813
7814 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007817 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 Py_DECREF(rep);
7819 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 if (PyLong_Check(rep)) {
7822 Py_ssize_t requiredsize = *outpos+1;
7823 if (outsize<requiredsize)
7824 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7825 Py_DECREF(rep);
7826 return enc_EXCEPTION;
7827 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007828 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007830 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 else {
7832 const char *repchars = PyBytes_AS_STRING(rep);
7833 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7834 Py_ssize_t requiredsize = *outpos+repsize;
7835 if (outsize<requiredsize)
7836 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7837 Py_DECREF(rep);
7838 return enc_EXCEPTION;
7839 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007840 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 memcpy(outstart + *outpos, repchars, repsize);
7842 *outpos += repsize;
7843 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007844 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845 Py_DECREF(rep);
7846 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007847}
7848
7849/* handle an error in PyUnicode_EncodeCharmap
7850 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007851static int
7852charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007853 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007854 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007855 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007856 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007857{
7858 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007859 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007860 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007861 enum PyUnicode_Kind kind;
7862 void *data;
7863 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007864 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007865 Py_ssize_t collstartpos = *inpos;
7866 Py_ssize_t collendpos = *inpos+1;
7867 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868 char *encoding = "charmap";
7869 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007870 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007871 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007872 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007873
Benjamin Petersonbac79492012-01-14 13:34:47 -05007874 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007875 return -1;
7876 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007877 /* find all unencodable characters */
7878 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007880 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007881 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007882 val = encoding_map_lookup(ch, mapping);
7883 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 break;
7885 ++collendpos;
7886 continue;
7887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007889 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7890 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 if (rep==NULL)
7892 return -1;
7893 else if (rep!=Py_None) {
7894 Py_DECREF(rep);
7895 break;
7896 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007897 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007899 }
7900 /* cache callback name lookup
7901 * (if not done yet, i.e. it's the first error) */
7902 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 if ((errors==NULL) || (!strcmp(errors, "strict")))
7904 *known_errorHandler = 1;
7905 else if (!strcmp(errors, "replace"))
7906 *known_errorHandler = 2;
7907 else if (!strcmp(errors, "ignore"))
7908 *known_errorHandler = 3;
7909 else if (!strcmp(errors, "xmlcharrefreplace"))
7910 *known_errorHandler = 4;
7911 else
7912 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007913 }
7914 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007915 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007916 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007917 return -1;
7918 case 2: /* replace */
7919 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 x = charmapencode_output('?', mapping, res, respos);
7921 if (x==enc_EXCEPTION) {
7922 return -1;
7923 }
7924 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007925 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 return -1;
7927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007928 }
7929 /* fall through */
7930 case 3: /* ignore */
7931 *inpos = collendpos;
7932 break;
7933 case 4: /* xmlcharrefreplace */
7934 /* generate replacement (temporarily (mis)uses p) */
7935 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 char buffer[2+29+1+1];
7937 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007938 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 for (cp = buffer; *cp; ++cp) {
7940 x = charmapencode_output(*cp, mapping, res, respos);
7941 if (x==enc_EXCEPTION)
7942 return -1;
7943 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007944 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 return -1;
7946 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007947 }
7948 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007949 *inpos = collendpos;
7950 break;
7951 default:
7952 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007953 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007955 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007957 if (PyBytes_Check(repunicode)) {
7958 /* Directly copy bytes result to output. */
7959 Py_ssize_t outsize = PyBytes_Size(*res);
7960 Py_ssize_t requiredsize;
7961 repsize = PyBytes_Size(repunicode);
7962 requiredsize = *respos + repsize;
7963 if (requiredsize > outsize)
7964 /* Make room for all additional bytes. */
7965 if (charmapencode_resize(res, respos, requiredsize)) {
7966 Py_DECREF(repunicode);
7967 return -1;
7968 }
7969 memcpy(PyBytes_AsString(*res) + *respos,
7970 PyBytes_AsString(repunicode), repsize);
7971 *respos += repsize;
7972 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007973 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007974 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007975 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007976 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007977 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007978 Py_DECREF(repunicode);
7979 return -1;
7980 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007981 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007982 data = PyUnicode_DATA(repunicode);
7983 kind = PyUnicode_KIND(repunicode);
7984 for (index = 0; index < repsize; index++) {
7985 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7986 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007988 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 return -1;
7990 }
7991 else if (x==enc_FAILED) {
7992 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007993 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 return -1;
7995 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007996 }
7997 *inpos = newpos;
7998 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999 }
8000 return 0;
8001}
8002
Alexander Belopolsky40018472011-02-26 01:02:56 +00008003PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008004_PyUnicode_EncodeCharmap(PyObject *unicode,
8005 PyObject *mapping,
8006 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008 /* output object */
8009 PyObject *res = NULL;
8010 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008011 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008012 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008013 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008014 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008015 PyObject *errorHandler = NULL;
8016 PyObject *exc = NULL;
8017 /* the following variable is used for caching string comparisons
8018 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8019 * 3=ignore, 4=xmlcharrefreplace */
8020 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008021 void *data;
8022 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023
Benjamin Petersonbac79492012-01-14 13:34:47 -05008024 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008025 return NULL;
8026 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008027 data = PyUnicode_DATA(unicode);
8028 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008029
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 /* Default to Latin-1 */
8031 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008032 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008034 /* allocate enough for a simple encoding without
8035 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008036 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008037 if (res == NULL)
8038 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008039 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008042 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008043 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008045 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 if (x==enc_EXCEPTION) /* error */
8047 goto onError;
8048 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008049 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 &exc,
8051 &known_errorHandler, &errorHandler, errors,
8052 &res, &respos)) {
8053 goto onError;
8054 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008055 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 else
8057 /* done with this character => adjust input position */
8058 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008061 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008062 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008063 if (_PyBytes_Resize(&res, respos) < 0)
8064 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008066 Py_XDECREF(exc);
8067 Py_XDECREF(errorHandler);
8068 return res;
8069
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008071 Py_XDECREF(res);
8072 Py_XDECREF(exc);
8073 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074 return NULL;
8075}
8076
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008077/* Deprecated */
8078PyObject *
8079PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8080 Py_ssize_t size,
8081 PyObject *mapping,
8082 const char *errors)
8083{
8084 PyObject *result;
8085 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8086 if (unicode == NULL)
8087 return NULL;
8088 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8089 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008090 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008091}
8092
Alexander Belopolsky40018472011-02-26 01:02:56 +00008093PyObject *
8094PyUnicode_AsCharmapString(PyObject *unicode,
8095 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
8097 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 PyErr_BadArgument();
8099 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102}
8103
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008105static void
8106make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008108 Py_ssize_t startpos, Py_ssize_t endpos,
8109 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 *exceptionObject = _PyUnicodeTranslateError_Create(
8113 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 }
8115 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8117 goto onError;
8118 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8119 goto onError;
8120 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8121 goto onError;
8122 return;
8123 onError:
8124 Py_DECREF(*exceptionObject);
8125 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 }
8127}
8128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129/* error handling callback helper:
8130 build arguments, call the callback and check the arguments,
8131 put the result into newpos and return the replacement string, which
8132 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008133static PyObject *
8134unicode_translate_call_errorhandler(const char *errors,
8135 PyObject **errorHandler,
8136 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008138 Py_ssize_t startpos, Py_ssize_t endpos,
8139 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008141 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008143 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144 PyObject *restuple;
8145 PyObject *resunicode;
8146
8147 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008149 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151 }
8152
8153 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008154 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008155 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008157
8158 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008160 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008163 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 Py_DECREF(restuple);
8165 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008166 }
8167 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 &resunicode, &i_newpos)) {
8169 Py_DECREF(restuple);
8170 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008171 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008172 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008173 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008174 else
8175 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8178 Py_DECREF(restuple);
8179 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008180 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008181 Py_INCREF(resunicode);
8182 Py_DECREF(restuple);
8183 return resunicode;
8184}
8185
8186/* Lookup the character ch in the mapping and put the result in result,
8187 which must be decrefed by the caller.
8188 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008189static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008190charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191{
Christian Heimes217cfd12007-12-02 14:31:20 +00008192 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193 PyObject *x;
8194
8195 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008197 x = PyObject_GetItem(mapping, w);
8198 Py_DECREF(w);
8199 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8201 /* No mapping found means: use 1:1 mapping. */
8202 PyErr_Clear();
8203 *result = NULL;
8204 return 0;
8205 } else
8206 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207 }
8208 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 *result = x;
8210 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008212 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 long value = PyLong_AS_LONG(x);
8214 long max = PyUnicode_GetMax();
8215 if (value < 0 || value > max) {
8216 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008217 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 Py_DECREF(x);
8219 return -1;
8220 }
8221 *result = x;
8222 return 0;
8223 }
8224 else if (PyUnicode_Check(x)) {
8225 *result = x;
8226 return 0;
8227 }
8228 else {
8229 /* wrong return value */
8230 PyErr_SetString(PyExc_TypeError,
8231 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 Py_DECREF(x);
8233 return -1;
8234 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235}
8236/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 if not reallocate and adjust various state variables.
8238 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008239static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008244 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008245 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 /* exponentially overallocate to minimize reallocations */
8247 if (requiredsize < 2 * oldsize)
8248 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008249 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8250 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008252 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 }
8255 return 0;
8256}
8257/* lookup the character, put the result in the output string and adjust
8258 various state variables. Return a new reference to the object that
8259 was put in the output buffer in *result, or Py_None, if the mapping was
8260 undefined (in which case no character was written).
8261 The called must decref result.
8262 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008263static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008264charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8265 PyObject *mapping, Py_UCS4 **output,
8266 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008267 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008269 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8270 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 }
8276 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008278 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008280 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008281 }
8282 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283 Py_ssize_t repsize;
8284 if (PyUnicode_READY(*res) == -1)
8285 return -1;
8286 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 if (repsize==1) {
8288 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 }
8291 else if (repsize!=0) {
8292 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 Py_ssize_t requiredsize = *opos +
8294 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296 Py_ssize_t i;
8297 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 for(i = 0; i < repsize; i++)
8300 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 }
8303 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 return 0;
8306}
8307
Alexander Belopolsky40018472011-02-26 01:02:56 +00008308PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309_PyUnicode_TranslateCharmap(PyObject *input,
8310 PyObject *mapping,
8311 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 /* input object */
8314 char *idata;
8315 Py_ssize_t size, i;
8316 int kind;
8317 /* output buffer */
8318 Py_UCS4 *output = NULL;
8319 Py_ssize_t osize;
8320 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008322 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323 char *reason = "character maps to <undefined>";
8324 PyObject *errorHandler = NULL;
8325 PyObject *exc = NULL;
8326 /* the following variable is used for caching string comparisons
8327 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8328 * 3=ignore, 4=xmlcharrefreplace */
8329 int known_errorHandler = -1;
8330
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 PyErr_BadArgument();
8333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008336 if (PyUnicode_READY(input) == -1)
8337 return NULL;
8338 idata = (char*)PyUnicode_DATA(input);
8339 kind = PyUnicode_KIND(input);
8340 size = PyUnicode_GET_LENGTH(input);
8341 i = 0;
8342
8343 if (size == 0) {
8344 Py_INCREF(input);
8345 return input;
8346 }
8347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 /* allocate enough for a simple 1:1 translation without
8349 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 osize = size;
8351 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8352 opos = 0;
8353 if (output == NULL) {
8354 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 /* try to encode it */
8360 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 if (charmaptranslate_output(input, i, mapping,
8362 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 Py_XDECREF(x);
8364 goto onError;
8365 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008366 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 else { /* untranslatable character */
8370 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8371 Py_ssize_t repsize;
8372 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 Py_ssize_t collstart = i;
8376 Py_ssize_t collend = i+1;
8377 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 while (collend < size) {
8381 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 goto onError;
8383 Py_XDECREF(x);
8384 if (x!=Py_None)
8385 break;
8386 ++collend;
8387 }
8388 /* cache callback name lookup
8389 * (if not done yet, i.e. it's the first error) */
8390 if (known_errorHandler==-1) {
8391 if ((errors==NULL) || (!strcmp(errors, "strict")))
8392 known_errorHandler = 1;
8393 else if (!strcmp(errors, "replace"))
8394 known_errorHandler = 2;
8395 else if (!strcmp(errors, "ignore"))
8396 known_errorHandler = 3;
8397 else if (!strcmp(errors, "xmlcharrefreplace"))
8398 known_errorHandler = 4;
8399 else
8400 known_errorHandler = 0;
8401 }
8402 switch (known_errorHandler) {
8403 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008404 make_translate_exception(&exc,
8405 input, collstart, collend, reason);
8406 if (exc != NULL)
8407 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 case 2: /* replace */
8410 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 for (coll = collstart; coll<collend; coll++)
8412 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 /* fall through */
8414 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 break;
8417 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418 /* generate replacement (temporarily (mis)uses i) */
8419 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 char buffer[2+29+1+1];
8421 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8423 if (charmaptranslate_makespace(&output, &osize,
8424 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 goto onError;
8426 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 break;
8431 default:
8432 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 reason, input, &exc,
8434 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008435 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008437 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008438 Py_DECREF(repunicode);
8439 goto onError;
8440 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008442 repsize = PyUnicode_GET_LENGTH(repunicode);
8443 if (charmaptranslate_makespace(&output, &osize,
8444 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 Py_DECREF(repunicode);
8446 goto onError;
8447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 for (uni2 = 0; repsize-->0; ++uni2)
8449 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8450 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 }
8454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8456 if (!res)
8457 goto onError;
8458 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 Py_XDECREF(exc);
8460 Py_XDECREF(errorHandler);
8461 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465 Py_XDECREF(exc);
8466 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 return NULL;
8468}
8469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470/* Deprecated. Use PyUnicode_Translate instead. */
8471PyObject *
8472PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8473 Py_ssize_t size,
8474 PyObject *mapping,
8475 const char *errors)
8476{
Christian Heimes5f520f42012-09-11 14:03:25 +02008477 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8479 if (!unicode)
8480 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008481 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8482 Py_DECREF(unicode);
8483 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484}
8485
Alexander Belopolsky40018472011-02-26 01:02:56 +00008486PyObject *
8487PyUnicode_Translate(PyObject *str,
8488 PyObject *mapping,
8489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490{
8491 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008492
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493 str = PyUnicode_FromObject(str);
8494 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008495 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497 Py_DECREF(str);
8498 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499}
Tim Petersced69f82003-09-16 20:30:58 +00008500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008502fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503{
8504 /* No need to call PyUnicode_READY(self) because this function is only
8505 called as a callback from fixup() which does it already. */
8506 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8507 const int kind = PyUnicode_KIND(self);
8508 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008509 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008510 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 Py_ssize_t i;
8512
8513 for (i = 0; i < len; ++i) {
8514 ch = PyUnicode_READ(kind, data, i);
8515 fixed = 0;
8516 if (ch > 127) {
8517 if (Py_UNICODE_ISSPACE(ch))
8518 fixed = ' ';
8519 else {
8520 const int decimal = Py_UNICODE_TODECIMAL(ch);
8521 if (decimal >= 0)
8522 fixed = '0' + decimal;
8523 }
8524 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008525 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008526 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 PyUnicode_WRITE(kind, data, i, fixed);
8528 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008529 else
8530 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 }
8533
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008534 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535}
8536
8537PyObject *
8538_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8539{
8540 if (!PyUnicode_Check(unicode)) {
8541 PyErr_BadInternalCall();
8542 return NULL;
8543 }
8544 if (PyUnicode_READY(unicode) == -1)
8545 return NULL;
8546 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8547 /* If the string is already ASCII, just return the same string */
8548 Py_INCREF(unicode);
8549 return unicode;
8550 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008551 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008552}
8553
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008554PyObject *
8555PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8556 Py_ssize_t length)
8557{
Victor Stinnerf0124502011-11-21 23:12:56 +01008558 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008559 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008560 Py_UCS4 maxchar;
8561 enum PyUnicode_Kind kind;
8562 void *data;
8563
Victor Stinner99d7ad02012-02-22 13:37:39 +01008564 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008565 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008566 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008567 if (ch > 127) {
8568 int decimal = Py_UNICODE_TODECIMAL(ch);
8569 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008570 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008571 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008572 }
8573 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008574
8575 /* Copy to a new string */
8576 decimal = PyUnicode_New(length, maxchar);
8577 if (decimal == NULL)
8578 return decimal;
8579 kind = PyUnicode_KIND(decimal);
8580 data = PyUnicode_DATA(decimal);
8581 /* Iterate over code points */
8582 for (i = 0; i < length; i++) {
8583 Py_UNICODE ch = s[i];
8584 if (ch > 127) {
8585 int decimal = Py_UNICODE_TODECIMAL(ch);
8586 if (decimal >= 0)
8587 ch = '0' + decimal;
8588 }
8589 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008591 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008592}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008593/* --- Decimal Encoder ---------------------------------------------------- */
8594
Alexander Belopolsky40018472011-02-26 01:02:56 +00008595int
8596PyUnicode_EncodeDecimal(Py_UNICODE *s,
8597 Py_ssize_t length,
8598 char *output,
8599 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008600{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008602 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008603 enum PyUnicode_Kind kind;
8604 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008605
8606 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 PyErr_BadArgument();
8608 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008609 }
8610
Victor Stinner42bf7752011-11-21 22:52:58 +01008611 unicode = PyUnicode_FromUnicode(s, length);
8612 if (unicode == NULL)
8613 return -1;
8614
Benjamin Petersonbac79492012-01-14 13:34:47 -05008615 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008616 Py_DECREF(unicode);
8617 return -1;
8618 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008619 kind = PyUnicode_KIND(unicode);
8620 data = PyUnicode_DATA(unicode);
8621
Victor Stinnerb84d7232011-11-22 01:50:07 +01008622 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008623 PyObject *exc;
8624 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008626 Py_ssize_t startpos;
8627
8628 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008629
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008631 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008632 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008634 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 decimal = Py_UNICODE_TODECIMAL(ch);
8636 if (decimal >= 0) {
8637 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008638 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 continue;
8640 }
8641 if (0 < ch && ch < 256) {
8642 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008643 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 continue;
8645 }
Victor Stinner6345be92011-11-25 20:09:01 +01008646
Victor Stinner42bf7752011-11-21 22:52:58 +01008647 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008648 exc = NULL;
8649 raise_encode_exception(&exc, "decimal", unicode,
8650 startpos, startpos+1,
8651 "invalid decimal Unicode string");
8652 Py_XDECREF(exc);
8653 Py_DECREF(unicode);
8654 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008655 }
8656 /* 0-terminate the output string */
8657 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008658 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008659 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008660}
8661
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662/* --- Helpers ------------------------------------------------------------ */
8663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008665any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 Py_ssize_t start,
8667 Py_ssize_t end)
8668{
8669 int kind1, kind2, kind;
8670 void *buf1, *buf2;
8671 Py_ssize_t len1, len2, result;
8672
8673 kind1 = PyUnicode_KIND(s1);
8674 kind2 = PyUnicode_KIND(s2);
8675 kind = kind1 > kind2 ? kind1 : kind2;
8676 buf1 = PyUnicode_DATA(s1);
8677 buf2 = PyUnicode_DATA(s2);
8678 if (kind1 != kind)
8679 buf1 = _PyUnicode_AsKind(s1, kind);
8680 if (!buf1)
8681 return -2;
8682 if (kind2 != kind)
8683 buf2 = _PyUnicode_AsKind(s2, kind);
8684 if (!buf2) {
8685 if (kind1 != kind) PyMem_Free(buf1);
8686 return -2;
8687 }
8688 len1 = PyUnicode_GET_LENGTH(s1);
8689 len2 = PyUnicode_GET_LENGTH(s2);
8690
Victor Stinner794d5672011-10-10 03:21:36 +02008691 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008692 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008693 case PyUnicode_1BYTE_KIND:
8694 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8695 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8696 else
8697 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8698 break;
8699 case PyUnicode_2BYTE_KIND:
8700 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8701 break;
8702 case PyUnicode_4BYTE_KIND:
8703 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8704 break;
8705 default:
8706 assert(0); result = -2;
8707 }
8708 }
8709 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008710 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008711 case PyUnicode_1BYTE_KIND:
8712 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8713 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8714 else
8715 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8716 break;
8717 case PyUnicode_2BYTE_KIND:
8718 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8719 break;
8720 case PyUnicode_4BYTE_KIND:
8721 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8722 break;
8723 default:
8724 assert(0); result = -2;
8725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 }
8727
8728 if (kind1 != kind)
8729 PyMem_Free(buf1);
8730 if (kind2 != kind)
8731 PyMem_Free(buf2);
8732
8733 return result;
8734}
8735
8736Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008737_PyUnicode_InsertThousandsGrouping(
8738 PyObject *unicode, Py_ssize_t index,
8739 Py_ssize_t n_buffer,
8740 void *digits, Py_ssize_t n_digits,
8741 Py_ssize_t min_width,
8742 const char *grouping, PyObject *thousands_sep,
8743 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744{
Victor Stinner41a863c2012-02-24 00:37:51 +01008745 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008746 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008747 Py_ssize_t thousands_sep_len;
8748 Py_ssize_t len;
8749
8750 if (unicode != NULL) {
8751 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008752 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008753 }
8754 else {
8755 kind = PyUnicode_1BYTE_KIND;
8756 data = NULL;
8757 }
8758 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8759 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8760 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8761 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008762 if (thousands_sep_kind < kind) {
8763 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8764 if (!thousands_sep_data)
8765 return -1;
8766 }
8767 else {
8768 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8769 if (!data)
8770 return -1;
8771 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008772 }
8773
Benjamin Petersonead6b532011-12-20 17:23:42 -06008774 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008776 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008777 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008778 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008779 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008780 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008781 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008782 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008783 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008784 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008785 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008786 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008788 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008789 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008790 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008791 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008792 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008794 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008795 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008796 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008797 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008798 break;
8799 default:
8800 assert(0);
8801 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008803 if (unicode != NULL && thousands_sep_kind != kind) {
8804 if (thousands_sep_kind < kind)
8805 PyMem_Free(thousands_sep_data);
8806 else
8807 PyMem_Free(data);
8808 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008809 if (unicode == NULL) {
8810 *maxchar = 127;
8811 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008812 *maxchar = MAX_MAXCHAR(*maxchar,
8813 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008814 }
8815 }
8816 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817}
8818
8819
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length.  After expansion:
     - `end` greater than `len` is clipped to `len`;
     - a negative `end` is taken relative to `len` and clamped at 0;
     - a negative `start` is taken relative to `len` and clamped at 0.
   `start` is not clipped against `len`.

   Every argument is parenthesized so that an expression argument (for
   example a conditional expression passed as `len`) keeps its meaning
   inside the comparisons.  Arguments are evaluated more than once, so they
   must not have side effects.

   The macro expands to two bare `if` statements rather than a
   do { } while (0) wrapper; use it only as a complete statement inside a
   braced block. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008834
Alexander Belopolsky40018472011-02-26 01:02:56 +00008835Py_ssize_t
8836PyUnicode_Count(PyObject *str,
8837 PyObject *substr,
8838 Py_ssize_t start,
8839 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008841 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008842 PyObject* str_obj;
8843 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 int kind1, kind2, kind;
8845 void *buf1 = NULL, *buf2 = NULL;
8846 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008847
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008848 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008849 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008851 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008852 if (!sub_obj) {
8853 Py_DECREF(str_obj);
8854 return -1;
8855 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008856 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008857 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 Py_DECREF(str_obj);
8859 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860 }
Tim Petersced69f82003-09-16 20:30:58 +00008861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 kind1 = PyUnicode_KIND(str_obj);
8863 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008864 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008865 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008867 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008868 if (kind2 > kind) {
8869 Py_DECREF(sub_obj);
8870 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008871 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008872 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008873 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 if (!buf2)
8876 goto onError;
8877 len1 = PyUnicode_GET_LENGTH(str_obj);
8878 len2 = PyUnicode_GET_LENGTH(sub_obj);
8879
8880 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008881 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008883 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8884 result = asciilib_count(
8885 ((Py_UCS1*)buf1) + start, end - start,
8886 buf2, len2, PY_SSIZE_T_MAX
8887 );
8888 else
8889 result = ucs1lib_count(
8890 ((Py_UCS1*)buf1) + start, end - start,
8891 buf2, len2, PY_SSIZE_T_MAX
8892 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 break;
8894 case PyUnicode_2BYTE_KIND:
8895 result = ucs2lib_count(
8896 ((Py_UCS2*)buf1) + start, end - start,
8897 buf2, len2, PY_SSIZE_T_MAX
8898 );
8899 break;
8900 case PyUnicode_4BYTE_KIND:
8901 result = ucs4lib_count(
8902 ((Py_UCS4*)buf1) + start, end - start,
8903 buf2, len2, PY_SSIZE_T_MAX
8904 );
8905 break;
8906 default:
8907 assert(0); result = 0;
8908 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008909
8910 Py_DECREF(sub_obj);
8911 Py_DECREF(str_obj);
8912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 if (kind2 != kind)
8914 PyMem_Free(buf2);
8915
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 onError:
8918 Py_DECREF(sub_obj);
8919 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 if (kind2 != kind && buf2)
8921 PyMem_Free(buf2);
8922 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923}
8924
Alexander Belopolsky40018472011-02-26 01:02:56 +00008925Py_ssize_t
8926PyUnicode_Find(PyObject *str,
8927 PyObject *sub,
8928 Py_ssize_t start,
8929 Py_ssize_t end,
8930 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008932 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008933
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008935 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008937 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008938 if (!sub) {
8939 Py_DECREF(str);
8940 return -2;
8941 }
8942 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8943 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 Py_DECREF(str);
8945 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 }
Tim Petersced69f82003-09-16 20:30:58 +00008947
Victor Stinner794d5672011-10-10 03:21:36 +02008948 result = any_find_slice(direction,
8949 str, sub, start, end
8950 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008951
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008953 Py_DECREF(sub);
8954
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 return result;
8956}
8957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958Py_ssize_t
8959PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8960 Py_ssize_t start, Py_ssize_t end,
8961 int direction)
8962{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008964 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 if (PyUnicode_READY(str) == -1)
8966 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008967 if (start < 0 || end < 0) {
8968 PyErr_SetString(PyExc_IndexError, "string index out of range");
8969 return -2;
8970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 if (end > PyUnicode_GET_LENGTH(str))
8972 end = PyUnicode_GET_LENGTH(str);
8973 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008974 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8975 kind, end-start, ch, direction);
8976 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008978 else
8979 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980}
8981
Alexander Belopolsky40018472011-02-26 01:02:56 +00008982static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008983tailmatch(PyObject *self,
8984 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008985 Py_ssize_t start,
8986 Py_ssize_t end,
8987 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 int kind_self;
8990 int kind_sub;
8991 void *data_self;
8992 void *data_sub;
8993 Py_ssize_t offset;
8994 Py_ssize_t i;
8995 Py_ssize_t end_sub;
8996
8997 if (PyUnicode_READY(self) == -1 ||
8998 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01008999 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000
9001 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 return 1;
9003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9005 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 kind_self = PyUnicode_KIND(self);
9010 data_self = PyUnicode_DATA(self);
9011 kind_sub = PyUnicode_KIND(substring);
9012 data_sub = PyUnicode_DATA(substring);
9013 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9014
9015 if (direction > 0)
9016 offset = end;
9017 else
9018 offset = start;
9019
9020 if (PyUnicode_READ(kind_self, data_self, offset) ==
9021 PyUnicode_READ(kind_sub, data_sub, 0) &&
9022 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9023 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9024 /* If both are of the same kind, memcmp is sufficient */
9025 if (kind_self == kind_sub) {
9026 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009027 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 data_sub,
9029 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009030 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 }
9032 /* otherwise we have to compare each character by first accesing it */
9033 else {
9034 /* We do not need to compare 0 and len(substring)-1 because
9035 the if statement above ensured already that they are equal
9036 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009037 for (i = 1; i < end_sub; ++i) {
9038 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9039 PyUnicode_READ(kind_sub, data_sub, i))
9040 return 0;
9041 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 }
9045
9046 return 0;
9047}
9048
Alexander Belopolsky40018472011-02-26 01:02:56 +00009049Py_ssize_t
9050PyUnicode_Tailmatch(PyObject *str,
9051 PyObject *substr,
9052 Py_ssize_t start,
9053 Py_ssize_t end,
9054 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009056 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009057
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058 str = PyUnicode_FromObject(str);
9059 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009060 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061 substr = PyUnicode_FromObject(substr);
9062 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 Py_DECREF(str);
9064 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065 }
Tim Petersced69f82003-09-16 20:30:58 +00009066
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009067 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069 Py_DECREF(str);
9070 Py_DECREF(substr);
9071 return result;
9072}
9073
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074/* Apply fixfct filter to the Unicode object self and return a
9075 reference to the modified object */
9076
Alexander Belopolsky40018472011-02-26 01:02:56 +00009077static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009078fixup(PyObject *self,
9079 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 PyObject *u;
9082 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009083 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009085 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009088 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 /* fix functions return the new maximum character in a string,
9091 if the kind of the resulting unicode object does not change,
9092 everything is fine. Otherwise we need to change the string kind
9093 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009094 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009095
9096 if (maxchar_new == 0) {
9097 /* no changes */;
9098 if (PyUnicode_CheckExact(self)) {
9099 Py_DECREF(u);
9100 Py_INCREF(self);
9101 return self;
9102 }
9103 else
9104 return u;
9105 }
9106
Victor Stinnere6abb482012-05-02 01:15:40 +02009107 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108
Victor Stinnereaab6042011-12-11 22:22:39 +01009109 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009111
9112 /* In case the maximum character changed, we need to
9113 convert the string to the new category. */
9114 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9115 if (v == NULL) {
9116 Py_DECREF(u);
9117 return NULL;
9118 }
9119 if (maxchar_new > maxchar_old) {
9120 /* If the maxchar increased so that the kind changed, not all
9121 characters are representable anymore and we need to fix the
9122 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009123 _PyUnicode_FastCopyCharacters(v, 0,
9124 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009125 maxchar_old = fixfct(v);
9126 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 }
9128 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009129 _PyUnicode_FastCopyCharacters(v, 0,
9130 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009132 Py_DECREF(u);
9133 assert(_PyUnicode_CheckConsistency(v, 1));
9134 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135}
9136
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009137static PyObject *
9138ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009140 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9141 char *resdata, *data = PyUnicode_DATA(self);
9142 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009143
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009144 res = PyUnicode_New(len, 127);
9145 if (res == NULL)
9146 return NULL;
9147 resdata = PyUnicode_DATA(res);
9148 if (lower)
9149 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009151 _Py_bytes_upper(resdata, data, len);
9152 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153}
9154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009156handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009158 Py_ssize_t j;
9159 int final_sigma;
9160 Py_UCS4 c;
9161 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009162
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009163 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9164
9165 where ! is a negation and \p{xxx} is a character with property xxx.
9166 */
9167 for (j = i - 1; j >= 0; j--) {
9168 c = PyUnicode_READ(kind, data, j);
9169 if (!_PyUnicode_IsCaseIgnorable(c))
9170 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009172 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9173 if (final_sigma) {
9174 for (j = i + 1; j < length; j++) {
9175 c = PyUnicode_READ(kind, data, j);
9176 if (!_PyUnicode_IsCaseIgnorable(c))
9177 break;
9178 }
9179 final_sigma = j == length || !_PyUnicode_IsCased(c);
9180 }
9181 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182}
9183
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009184static int
9185lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9186 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009188 /* Obscure special case. */
9189 if (c == 0x3A3) {
9190 mapped[0] = handle_capital_sigma(kind, data, length, i);
9191 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009193 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194}
9195
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009196static Py_ssize_t
9197do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009199 Py_ssize_t i, k = 0;
9200 int n_res, j;
9201 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009202
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009203 c = PyUnicode_READ(kind, data, 0);
9204 n_res = _PyUnicode_ToUpperFull(c, mapped);
9205 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009206 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009207 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009209 for (i = 1; i < length; i++) {
9210 c = PyUnicode_READ(kind, data, i);
9211 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9212 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009213 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009214 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009215 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009216 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009217 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218}
9219
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009220static Py_ssize_t
9221do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9222 Py_ssize_t i, k = 0;
9223
9224 for (i = 0; i < length; i++) {
9225 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9226 int n_res, j;
9227 if (Py_UNICODE_ISUPPER(c)) {
9228 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9229 }
9230 else if (Py_UNICODE_ISLOWER(c)) {
9231 n_res = _PyUnicode_ToUpperFull(c, mapped);
9232 }
9233 else {
9234 n_res = 1;
9235 mapped[0] = c;
9236 }
9237 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009238 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009239 res[k++] = mapped[j];
9240 }
9241 }
9242 return k;
9243}
9244
9245static Py_ssize_t
9246do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9247 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009249 Py_ssize_t i, k = 0;
9250
9251 for (i = 0; i < length; i++) {
9252 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9253 int n_res, j;
9254 if (lower)
9255 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9256 else
9257 n_res = _PyUnicode_ToUpperFull(c, mapped);
9258 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009259 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009260 res[k++] = mapped[j];
9261 }
9262 }
9263 return k;
9264}
9265
9266static Py_ssize_t
9267do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9268{
9269 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9270}
9271
9272static Py_ssize_t
9273do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9274{
9275 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9276}
9277
Benjamin Petersone51757f2012-01-12 21:10:29 -05009278static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009279do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9280{
9281 Py_ssize_t i, k = 0;
9282
9283 for (i = 0; i < length; i++) {
9284 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9285 Py_UCS4 mapped[3];
9286 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9287 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009288 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009289 res[k++] = mapped[j];
9290 }
9291 }
9292 return k;
9293}
9294
9295static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009296do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9297{
9298 Py_ssize_t i, k = 0;
9299 int previous_is_cased;
9300
9301 previous_is_cased = 0;
9302 for (i = 0; i < length; i++) {
9303 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9304 Py_UCS4 mapped[3];
9305 int n_res, j;
9306
9307 if (previous_is_cased)
9308 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9309 else
9310 n_res = _PyUnicode_ToTitleFull(c, mapped);
9311
9312 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009313 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009314 res[k++] = mapped[j];
9315 }
9316
9317 previous_is_cased = _PyUnicode_IsCased(c);
9318 }
9319 return k;
9320}
9321
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009322static PyObject *
9323case_operation(PyObject *self,
9324 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9325{
9326 PyObject *res = NULL;
9327 Py_ssize_t length, newlength = 0;
9328 int kind, outkind;
9329 void *data, *outdata;
9330 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9331
Benjamin Petersoneea48462012-01-16 14:28:50 -05009332 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009333
9334 kind = PyUnicode_KIND(self);
9335 data = PyUnicode_DATA(self);
9336 length = PyUnicode_GET_LENGTH(self);
9337 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9338 if (tmp == NULL)
9339 return PyErr_NoMemory();
9340 newlength = perform(kind, data, length, tmp, &maxchar);
9341 res = PyUnicode_New(newlength, maxchar);
9342 if (res == NULL)
9343 goto leave;
9344 tmpend = tmp + newlength;
9345 outdata = PyUnicode_DATA(res);
9346 outkind = PyUnicode_KIND(res);
9347 switch (outkind) {
9348 case PyUnicode_1BYTE_KIND:
9349 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9350 break;
9351 case PyUnicode_2BYTE_KIND:
9352 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9353 break;
9354 case PyUnicode_4BYTE_KIND:
9355 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9356 break;
9357 default:
9358 assert(0);
9359 break;
9360 }
9361 leave:
9362 PyMem_FREE(tmp);
9363 return res;
9364}
9365
Tim Peters8ce9f162004-08-27 01:49:32 +00009366PyObject *
9367PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009370 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009372 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009373 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9374 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009375 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009377 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009379 int use_memcpy;
9380 unsigned char *res_data = NULL, *sep_data = NULL;
9381 PyObject *last_obj;
9382 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383
Tim Peters05eba1f2004-08-27 21:32:02 +00009384 fseq = PySequence_Fast(seq, "");
9385 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009386 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009387 }
9388
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009389 /* NOTE: the following code can't call back into Python code,
9390 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009391 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009392
Tim Peters05eba1f2004-08-27 21:32:02 +00009393 seqlen = PySequence_Fast_GET_SIZE(fseq);
9394 /* If empty sequence, return u"". */
9395 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009396 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009397 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009398 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009399
Tim Peters05eba1f2004-08-27 21:32:02 +00009400 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009401 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009402 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009403 if (seqlen == 1) {
9404 if (PyUnicode_CheckExact(items[0])) {
9405 res = items[0];
9406 Py_INCREF(res);
9407 Py_DECREF(fseq);
9408 return res;
9409 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009410 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009411 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009412 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009413 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009414 /* Set up sep and seplen */
9415 if (separator == NULL) {
9416 /* fall back to a blank space separator */
9417 sep = PyUnicode_FromOrdinal(' ');
9418 if (!sep)
9419 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009420 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009421 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009422 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009423 else {
9424 if (!PyUnicode_Check(separator)) {
9425 PyErr_Format(PyExc_TypeError,
9426 "separator: expected str instance,"
9427 " %.80s found",
9428 Py_TYPE(separator)->tp_name);
9429 goto onError;
9430 }
9431 if (PyUnicode_READY(separator))
9432 goto onError;
9433 sep = separator;
9434 seplen = PyUnicode_GET_LENGTH(separator);
9435 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9436 /* inc refcount to keep this code path symmetric with the
9437 above case of a blank separator */
9438 Py_INCREF(sep);
9439 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009440 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009441 }
9442
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009443 /* There are at least two things to join, or else we have a subclass
9444 * of str in the sequence.
9445 * Do a pre-pass to figure out the total amount of space we'll
9446 * need (sz), and see whether all argument are strings.
9447 */
9448 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009449#ifdef Py_DEBUG
9450 use_memcpy = 0;
9451#else
9452 use_memcpy = 1;
9453#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009454 for (i = 0; i < seqlen; i++) {
9455 const Py_ssize_t old_sz = sz;
9456 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 if (!PyUnicode_Check(item)) {
9458 PyErr_Format(PyExc_TypeError,
9459 "sequence item %zd: expected str instance,"
9460 " %.80s found",
9461 i, Py_TYPE(item)->tp_name);
9462 goto onError;
9463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 if (PyUnicode_READY(item) == -1)
9465 goto onError;
9466 sz += PyUnicode_GET_LENGTH(item);
9467 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009468 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009469 if (i != 0)
9470 sz += seplen;
9471 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9472 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009474 goto onError;
9475 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009476 if (use_memcpy && last_obj != NULL) {
9477 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9478 use_memcpy = 0;
9479 }
9480 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009481 }
Tim Petersced69f82003-09-16 20:30:58 +00009482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009484 if (res == NULL)
9485 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009486
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009487 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009488#ifdef Py_DEBUG
9489 use_memcpy = 0;
9490#else
9491 if (use_memcpy) {
9492 res_data = PyUnicode_1BYTE_DATA(res);
9493 kind = PyUnicode_KIND(res);
9494 if (seplen != 0)
9495 sep_data = PyUnicode_1BYTE_DATA(sep);
9496 }
9497#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009499 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009500 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009502 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009503 if (use_memcpy) {
9504 Py_MEMCPY(res_data,
9505 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009506 kind * seplen);
9507 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009508 }
9509 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009510 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009511 res_offset += seplen;
9512 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009514 itemlen = PyUnicode_GET_LENGTH(item);
9515 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009516 if (use_memcpy) {
9517 Py_MEMCPY(res_data,
9518 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009519 kind * itemlen);
9520 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009521 }
9522 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009523 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009524 res_offset += itemlen;
9525 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009526 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009527 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009528 if (use_memcpy)
9529 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009530 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009531 else
9532 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009533
Tim Peters05eba1f2004-08-27 21:32:02 +00009534 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009536 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009540 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009542 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 return NULL;
9544}
9545
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the character
   buffer `data`, beginning at character index `start`.  `kind` selects the
   element type of the buffer (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND
   or PyUnicode_4BYTE_KIND); PyUnicode_WCHAR_KIND is not supported.

   `value` and `length` are evaluated exactly once.  No bounds checking is
   performed.  Expands to a single statement. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_ssize_t fill_len_ = (length); \
        const Py_UCS4 fill_val_ = (value); \
        Py_ssize_t i_; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + (start); \
            memset(to_, (unsigned char)fill_val_, (size_t)fill_len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + (start); \
            for (i_ = 0; i_ < fill_len_; ++i_) \
                to_[i_] = (Py_UCS2)fill_val_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + (start); \
            for (i_ = 0; i_ < fill_len_; ++i_) \
                to_[i_] = fill_val_; \
            break; \
        } \
        default: \
            assert(0); \
        } \
    } while (0)
9569
Victor Stinnerd3f08822012-05-29 12:57:52 +02009570void
9571_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9572 Py_UCS4 fill_char)
9573{
9574 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9575 const void *data = PyUnicode_DATA(unicode);
9576 assert(PyUnicode_IS_READY(unicode));
9577 assert(unicode_modifiable(unicode));
9578 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9579 assert(start >= 0);
9580 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9581 FILL(kind, data, fill_char, start, length);
9582}
9583
Victor Stinner3fe55312012-01-04 00:33:50 +01009584Py_ssize_t
9585PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9586 Py_UCS4 fill_char)
9587{
9588 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009589
9590 if (!PyUnicode_Check(unicode)) {
9591 PyErr_BadInternalCall();
9592 return -1;
9593 }
9594 if (PyUnicode_READY(unicode) == -1)
9595 return -1;
9596 if (unicode_check_modifiable(unicode))
9597 return -1;
9598
Victor Stinnerd3f08822012-05-29 12:57:52 +02009599 if (start < 0) {
9600 PyErr_SetString(PyExc_IndexError, "string index out of range");
9601 return -1;
9602 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009603 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9604 PyErr_SetString(PyExc_ValueError,
9605 "fill character is bigger than "
9606 "the string maximum character");
9607 return -1;
9608 }
9609
9610 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9611 length = Py_MIN(maxlen, length);
9612 if (length <= 0)
9613 return 0;
9614
Victor Stinnerd3f08822012-05-29 12:57:52 +02009615 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009616 return length;
9617}
9618
Victor Stinner9310abb2011-10-05 00:59:23 +02009619static PyObject *
9620pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009621 Py_ssize_t left,
9622 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 PyObject *u;
9626 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009627 int kind;
9628 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629
9630 if (left < 0)
9631 left = 0;
9632 if (right < 0)
9633 right = 0;
9634
Victor Stinnerc4b49542011-12-11 22:44:26 +01009635 if (left == 0 && right == 0)
9636 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9639 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009640 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9641 return NULL;
9642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009644 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009646 if (!u)
9647 return NULL;
9648
9649 kind = PyUnicode_KIND(u);
9650 data = PyUnicode_DATA(u);
9651 if (left)
9652 FILL(kind, data, fill, 0, left);
9653 if (right)
9654 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009655 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009656 assert(_PyUnicode_CheckConsistency(u, 1));
9657 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658}
9659
Alexander Belopolsky40018472011-02-26 01:02:56 +00009660PyObject *
9661PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664
9665 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009666 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009668 if (PyUnicode_READY(string) == -1) {
9669 Py_DECREF(string);
9670 return NULL;
9671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672
Benjamin Petersonead6b532011-12-20 17:23:42 -06009673 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009675 if (PyUnicode_IS_ASCII(string))
9676 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009677 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009678 PyUnicode_GET_LENGTH(string), keepends);
9679 else
9680 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009681 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009682 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 break;
9684 case PyUnicode_2BYTE_KIND:
9685 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009686 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 PyUnicode_GET_LENGTH(string), keepends);
9688 break;
9689 case PyUnicode_4BYTE_KIND:
9690 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009691 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 PyUnicode_GET_LENGTH(string), keepends);
9693 break;
9694 default:
9695 assert(0);
9696 list = 0;
9697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698 Py_DECREF(string);
9699 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700}
9701
Alexander Belopolsky40018472011-02-26 01:02:56 +00009702static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009703split(PyObject *self,
9704 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009705 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 int kind1, kind2, kind;
9708 void *buf1, *buf2;
9709 Py_ssize_t len1, len2;
9710 PyObject* out;
9711
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009713 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 if (PyUnicode_READY(self) == -1)
9716 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009719 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009721 if (PyUnicode_IS_ASCII(self))
9722 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009723 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009724 PyUnicode_GET_LENGTH(self), maxcount
9725 );
9726 else
9727 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009728 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009729 PyUnicode_GET_LENGTH(self), maxcount
9730 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 case PyUnicode_2BYTE_KIND:
9732 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009733 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 PyUnicode_GET_LENGTH(self), maxcount
9735 );
9736 case PyUnicode_4BYTE_KIND:
9737 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009738 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 PyUnicode_GET_LENGTH(self), maxcount
9740 );
9741 default:
9742 assert(0);
9743 return NULL;
9744 }
9745
9746 if (PyUnicode_READY(substring) == -1)
9747 return NULL;
9748
9749 kind1 = PyUnicode_KIND(self);
9750 kind2 = PyUnicode_KIND(substring);
9751 kind = kind1 > kind2 ? kind1 : kind2;
9752 buf1 = PyUnicode_DATA(self);
9753 buf2 = PyUnicode_DATA(substring);
9754 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009755 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 if (!buf1)
9757 return NULL;
9758 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009759 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 if (!buf2) {
9761 if (kind1 != kind) PyMem_Free(buf1);
9762 return NULL;
9763 }
9764 len1 = PyUnicode_GET_LENGTH(self);
9765 len2 = PyUnicode_GET_LENGTH(substring);
9766
Benjamin Petersonead6b532011-12-20 17:23:42 -06009767 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009769 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9770 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009771 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 else
9773 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009774 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 break;
9776 case PyUnicode_2BYTE_KIND:
9777 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 break;
9780 case PyUnicode_4BYTE_KIND:
9781 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009782 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 break;
9784 default:
9785 out = NULL;
9786 }
9787 if (kind1 != kind)
9788 PyMem_Free(buf1);
9789 if (kind2 != kind)
9790 PyMem_Free(buf2);
9791 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792}
9793
Alexander Belopolsky40018472011-02-26 01:02:56 +00009794static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009795rsplit(PyObject *self,
9796 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009797 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 int kind1, kind2, kind;
9800 void *buf1, *buf2;
9801 Py_ssize_t len1, len2;
9802 PyObject* out;
9803
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009804 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009805 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 if (PyUnicode_READY(self) == -1)
9808 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009811 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009813 if (PyUnicode_IS_ASCII(self))
9814 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009815 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009816 PyUnicode_GET_LENGTH(self), maxcount
9817 );
9818 else
9819 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009821 PyUnicode_GET_LENGTH(self), maxcount
9822 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 case PyUnicode_2BYTE_KIND:
9824 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009825 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 PyUnicode_GET_LENGTH(self), maxcount
9827 );
9828 case PyUnicode_4BYTE_KIND:
9829 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 PyUnicode_GET_LENGTH(self), maxcount
9832 );
9833 default:
9834 assert(0);
9835 return NULL;
9836 }
9837
9838 if (PyUnicode_READY(substring) == -1)
9839 return NULL;
9840
9841 kind1 = PyUnicode_KIND(self);
9842 kind2 = PyUnicode_KIND(substring);
9843 kind = kind1 > kind2 ? kind1 : kind2;
9844 buf1 = PyUnicode_DATA(self);
9845 buf2 = PyUnicode_DATA(substring);
9846 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009847 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 if (!buf1)
9849 return NULL;
9850 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009851 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 if (!buf2) {
9853 if (kind1 != kind) PyMem_Free(buf1);
9854 return NULL;
9855 }
9856 len1 = PyUnicode_GET_LENGTH(self);
9857 len2 = PyUnicode_GET_LENGTH(substring);
9858
Benjamin Petersonead6b532011-12-20 17:23:42 -06009859 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009861 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9862 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009863 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009864 else
9865 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009866 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 break;
9868 case PyUnicode_2BYTE_KIND:
9869 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009870 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 break;
9872 case PyUnicode_4BYTE_KIND:
9873 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009874 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 break;
9876 default:
9877 out = NULL;
9878 }
9879 if (kind1 != kind)
9880 PyMem_Free(buf1);
9881 if (kind2 != kind)
9882 PyMem_Free(buf2);
9883 return out;
9884}
9885
9886static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009887anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9888 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009890 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009892 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9893 return asciilib_find(buf1, len1, buf2, len2, offset);
9894 else
9895 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 case PyUnicode_2BYTE_KIND:
9897 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9898 case PyUnicode_4BYTE_KIND:
9899 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9900 }
9901 assert(0);
9902 return -1;
9903}
9904
9905static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009906anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9907 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009909 switch (kind) {
9910 case PyUnicode_1BYTE_KIND:
9911 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9912 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9913 else
9914 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9915 case PyUnicode_2BYTE_KIND:
9916 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9917 case PyUnicode_4BYTE_KIND:
9918 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9919 }
9920 assert(0);
9921 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009922}
9923
Alexander Belopolsky40018472011-02-26 01:02:56 +00009924static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925replace(PyObject *self, PyObject *str1,
9926 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 PyObject *u;
9929 char *sbuf = PyUnicode_DATA(self);
9930 char *buf1 = PyUnicode_DATA(str1);
9931 char *buf2 = PyUnicode_DATA(str2);
9932 int srelease = 0, release1 = 0, release2 = 0;
9933 int skind = PyUnicode_KIND(self);
9934 int kind1 = PyUnicode_KIND(str1);
9935 int kind2 = PyUnicode_KIND(str2);
9936 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9937 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9938 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009939 int mayshrink;
9940 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941
9942 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009945 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946
Victor Stinner59de0ee2011-10-07 10:01:28 +02009947 if (str1 == str2)
9948 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 if (skind < kind1)
9950 /* substring too wide to be present */
9951 goto nothing;
9952
Victor Stinner49a0a212011-10-12 23:46:10 +02009953 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9954 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9955 /* Replacing str1 with str2 may cause a maxchar reduction in the
9956 result string. */
9957 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009958 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009961 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009963 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009965 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009966 Py_UCS4 u1, u2;
9967 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009968 Py_ssize_t index, pos;
9969 char *src;
9970
Victor Stinner69ed0f42013-04-09 21:48:24 +02009971 u1 = PyUnicode_READ(kind1, buf1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009972 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9973 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009974 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +02009975 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009977 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009979 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009981
9982 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9983 index = 0;
9984 src = sbuf;
9985 while (--maxcount)
9986 {
9987 pos++;
9988 src += pos * PyUnicode_KIND(self);
9989 slen -= pos;
9990 index += pos;
9991 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9992 if (pos < 0)
9993 break;
9994 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9995 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009996 }
9997 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 int rkind = skind;
9999 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010000 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 if (kind1 < rkind) {
10003 /* widen substring */
10004 buf1 = _PyUnicode_AsKind(str1, rkind);
10005 if (!buf1) goto error;
10006 release1 = 1;
10007 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010008 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010009 if (i < 0)
10010 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 if (rkind > kind2) {
10012 /* widen replacement */
10013 buf2 = _PyUnicode_AsKind(str2, rkind);
10014 if (!buf2) goto error;
10015 release2 = 1;
10016 }
10017 else if (rkind < kind2) {
10018 /* widen self and buf1 */
10019 rkind = kind2;
10020 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010021 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 sbuf = _PyUnicode_AsKind(self, rkind);
10023 if (!sbuf) goto error;
10024 srelease = 1;
10025 buf1 = _PyUnicode_AsKind(str1, rkind);
10026 if (!buf1) goto error;
10027 release1 = 1;
10028 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010029 u = PyUnicode_New(slen, maxchar);
10030 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010032 assert(PyUnicode_KIND(u) == rkind);
10033 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010034
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010035 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010036 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010037 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010039 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010041
10042 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010043 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010044 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010045 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010046 if (i == -1)
10047 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010048 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010050 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010054 }
10055 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010057 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 int rkind = skind;
10059 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010062 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 buf1 = _PyUnicode_AsKind(str1, rkind);
10064 if (!buf1) goto error;
10065 release1 = 1;
10066 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010067 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010068 if (n == 0)
10069 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010071 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 buf2 = _PyUnicode_AsKind(str2, rkind);
10073 if (!buf2) goto error;
10074 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010077 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 rkind = kind2;
10079 sbuf = _PyUnicode_AsKind(self, rkind);
10080 if (!sbuf) goto error;
10081 srelease = 1;
10082 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010083 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 buf1 = _PyUnicode_AsKind(str1, rkind);
10085 if (!buf1) goto error;
10086 release1 = 1;
10087 }
10088 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10089 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010090 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 PyErr_SetString(PyExc_OverflowError,
10092 "replace string is too long");
10093 goto error;
10094 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010095 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010096 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010097 _Py_INCREF_UNICODE_EMPTY();
10098 if (!unicode_empty)
10099 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010100 u = unicode_empty;
10101 goto done;
10102 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010103 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 PyErr_SetString(PyExc_OverflowError,
10105 "replace string is too long");
10106 goto error;
10107 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010108 u = PyUnicode_New(new_size, maxchar);
10109 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010111 assert(PyUnicode_KIND(u) == rkind);
10112 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 ires = i = 0;
10114 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010115 while (n-- > 0) {
10116 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010117 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010118 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010119 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010120 if (j == -1)
10121 break;
10122 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010123 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010124 memcpy(res + rkind * ires,
10125 sbuf + rkind * i,
10126 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010128 }
10129 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010131 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010133 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010139 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010140 memcpy(res + rkind * ires,
10141 sbuf + rkind * i,
10142 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010143 }
10144 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010145 /* interleave */
10146 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010149 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010151 if (--n <= 0)
10152 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010153 memcpy(res + rkind * ires,
10154 sbuf + rkind * i,
10155 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 ires++;
10157 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010158 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010159 memcpy(res + rkind * ires,
10160 sbuf + rkind * i,
10161 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010162 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010163 }
10164
10165 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010166 unicode_adjust_maxchar(&u);
10167 if (u == NULL)
10168 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010170
10171 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (srelease)
10173 PyMem_FREE(sbuf);
10174 if (release1)
10175 PyMem_FREE(buf1);
10176 if (release2)
10177 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010178 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010180
Benjamin Peterson29060642009-01-31 22:14:21 +000010181 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010182 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 if (srelease)
10184 PyMem_FREE(sbuf);
10185 if (release1)
10186 PyMem_FREE(buf1);
10187 if (release2)
10188 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010189 return unicode_result_unchanged(self);
10190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 error:
10192 if (srelease && sbuf)
10193 PyMem_FREE(sbuf);
10194 if (release1 && buf1)
10195 PyMem_FREE(buf1);
10196 if (release2 && buf2)
10197 PyMem_FREE(buf2);
10198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199}
10200
10201/* --- Unicode Object Methods --------------------------------------------- */
10202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010203PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010204 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205\n\
10206Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010207characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208
10209static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010210unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010212 if (PyUnicode_READY(self) == -1)
10213 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010214 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215}
10216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010217PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010218 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219\n\
10220Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010221have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222
10223static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010224unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010226 if (PyUnicode_READY(self) == -1)
10227 return NULL;
10228 if (PyUnicode_GET_LENGTH(self) == 0)
10229 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010230 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231}
10232
Benjamin Petersond5890c82012-01-14 13:23:30 -050010233PyDoc_STRVAR(casefold__doc__,
10234 "S.casefold() -> str\n\
10235\n\
10236Return a version of S suitable for caseless comparisons.");
10237
10238static PyObject *
10239unicode_casefold(PyObject *self)
10240{
10241 if (PyUnicode_READY(self) == -1)
10242 return NULL;
10243 if (PyUnicode_IS_ASCII(self))
10244 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010245 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010246}
10247
10248
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010249/* Argument converter. Coerces to a single unicode character */
10250
10251static int
10252convert_uc(PyObject *obj, void *addr)
10253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010255 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010256
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 uniobj = PyUnicode_FromObject(obj);
10258 if (uniobj == NULL) {
10259 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010260 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010261 return 0;
10262 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010264 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010265 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010266 Py_DECREF(uniobj);
10267 return 0;
10268 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010270 Py_DECREF(uniobj);
10271 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010272}
10273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010274PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010275 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010277Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010278done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279
10280static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010281unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010283 Py_ssize_t marg, left;
10284 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 Py_UCS4 fillchar = ' ';
10286
Victor Stinnere9a29352011-10-01 02:14:59 +020010287 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289
Benjamin Petersonbac79492012-01-14 13:34:47 -050010290 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291 return NULL;
10292
Victor Stinnerc4b49542011-12-11 22:44:26 +010010293 if (PyUnicode_GET_LENGTH(self) >= width)
10294 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295
Victor Stinnerc4b49542011-12-11 22:44:26 +010010296 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297 left = marg / 2 + (marg & width & 1);
10298
Victor Stinner9310abb2011-10-05 00:59:23 +020010299 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300}
10301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302/* This function assumes that str1 and str2 are readied by the caller. */
10303
Marc-André Lemburge5034372000-08-08 08:04:29 +000010304static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010305unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010306{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010307#define COMPARE(TYPE1, TYPE2) \
10308 do { \
10309 TYPE1* p1 = (TYPE1 *)data1; \
10310 TYPE2* p2 = (TYPE2 *)data2; \
10311 TYPE1* end = p1 + len; \
10312 Py_UCS4 c1, c2; \
10313 for (; p1 != end; p1++, p2++) { \
10314 c1 = *p1; \
10315 c2 = *p2; \
10316 if (c1 != c2) \
10317 return (c1 < c2) ? -1 : 1; \
10318 } \
10319 } \
10320 while (0)
10321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 int kind1, kind2;
10323 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010324 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010325
Victor Stinner90db9c42012-10-04 21:53:50 +020010326 /* a string is equal to itself */
10327 if (str1 == str2)
10328 return 0;
10329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 kind1 = PyUnicode_KIND(str1);
10331 kind2 = PyUnicode_KIND(str2);
10332 data1 = PyUnicode_DATA(str1);
10333 data2 = PyUnicode_DATA(str2);
10334 len1 = PyUnicode_GET_LENGTH(str1);
10335 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010336 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010337
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010338 switch(kind1) {
10339 case PyUnicode_1BYTE_KIND:
10340 {
10341 switch(kind2) {
10342 case PyUnicode_1BYTE_KIND:
10343 {
10344 int cmp = memcmp(data1, data2, len);
10345 /* normalize result of memcmp() into the range [-1; 1] */
10346 if (cmp < 0)
10347 return -1;
10348 if (cmp > 0)
10349 return 1;
10350 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010351 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010352 case PyUnicode_2BYTE_KIND:
10353 COMPARE(Py_UCS1, Py_UCS2);
10354 break;
10355 case PyUnicode_4BYTE_KIND:
10356 COMPARE(Py_UCS1, Py_UCS4);
10357 break;
10358 default:
10359 assert(0);
10360 }
10361 break;
10362 }
10363 case PyUnicode_2BYTE_KIND:
10364 {
10365 switch(kind2) {
10366 case PyUnicode_1BYTE_KIND:
10367 COMPARE(Py_UCS2, Py_UCS1);
10368 break;
10369 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010370 {
10371#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 2
10372 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10373 /* normalize result of wmemcmp() into the range [-1; 1] */
10374 if (cmp < 0)
10375 return -1;
10376 if (cmp > 0)
10377 return 1;
10378#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010379 COMPARE(Py_UCS2, Py_UCS2);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010380#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010381 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010382 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010383 case PyUnicode_4BYTE_KIND:
10384 COMPARE(Py_UCS2, Py_UCS4);
10385 break;
10386 default:
10387 assert(0);
10388 }
10389 break;
10390 }
10391 case PyUnicode_4BYTE_KIND:
10392 {
10393 switch(kind2) {
10394 case PyUnicode_1BYTE_KIND:
10395 COMPARE(Py_UCS4, Py_UCS1);
10396 break;
10397 case PyUnicode_2BYTE_KIND:
10398 COMPARE(Py_UCS4, Py_UCS2);
10399 break;
10400 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010401 {
10402#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10403 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10404 /* normalize result of wmemcmp() into the range [-1; 1] */
10405 if (cmp < 0)
10406 return -1;
10407 if (cmp > 0)
10408 return 1;
10409#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010410 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010411#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010412 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010413 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010414 default:
10415 assert(0);
10416 }
10417 break;
10418 }
10419 default:
10420 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010421 }
10422
Victor Stinner770e19e2012-10-04 22:59:45 +020010423 if (len1 == len2)
10424 return 0;
10425 if (len1 < len2)
10426 return -1;
10427 else
10428 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010429
10430#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010431}
10432
Victor Stinnere5567ad2012-10-23 02:48:49 +020010433static int
10434unicode_compare_eq(PyObject *str1, PyObject *str2)
10435{
10436 int kind;
10437 void *data1, *data2;
10438 Py_ssize_t len;
10439 int cmp;
10440
10441 /* a string is equal to itself */
10442 if (str1 == str2)
10443 return 1;
10444
10445 len = PyUnicode_GET_LENGTH(str1);
10446 if (PyUnicode_GET_LENGTH(str2) != len)
10447 return 0;
10448 kind = PyUnicode_KIND(str1);
10449 if (PyUnicode_KIND(str2) != kind)
10450 return 0;
10451 data1 = PyUnicode_DATA(str1);
10452 data2 = PyUnicode_DATA(str2);
10453
10454 cmp = memcmp(data1, data2, len * kind);
10455 return (cmp == 0);
10456}
10457
10458
Alexander Belopolsky40018472011-02-26 01:02:56 +000010459int
10460PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10463 if (PyUnicode_READY(left) == -1 ||
10464 PyUnicode_READY(right) == -1)
10465 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010466 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010468 PyErr_Format(PyExc_TypeError,
10469 "Can't compare %.100s and %.100s",
10470 left->ob_type->tp_name,
10471 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472 return -1;
10473}
10474
Martin v. Löwis5b222132007-06-10 09:51:05 +000010475int
10476PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 Py_ssize_t i;
10479 int kind;
10480 void *data;
10481 Py_UCS4 chr;
10482
Victor Stinner910337b2011-10-03 03:20:16 +020010483 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 if (PyUnicode_READY(uni) == -1)
10485 return -1;
10486 kind = PyUnicode_KIND(uni);
10487 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010488 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10490 if (chr != str[i])
10491 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010492 /* This check keeps Python strings that end in '\0' from comparing equal
10493 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010496 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010497 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010498 return 0;
10499}
10500
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010501
Benjamin Peterson29060642009-01-31 22:14:21 +000010502#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010503 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010504
Alexander Belopolsky40018472011-02-26 01:02:56 +000010505PyObject *
10506PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010507{
10508 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010509 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010510
Victor Stinnere5567ad2012-10-23 02:48:49 +020010511 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10512 Py_RETURN_NOTIMPLEMENTED;
10513
10514 if (PyUnicode_READY(left) == -1 ||
10515 PyUnicode_READY(right) == -1)
10516 return NULL;
10517
10518 if (op == Py_EQ || op == Py_NE) {
10519 result = unicode_compare_eq(left, right);
10520 if (op == Py_EQ)
10521 v = TEST_COND(result);
10522 else
10523 v = TEST_COND(!result);
10524 }
10525 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010526 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010527
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010528 /* Convert the return value to a Boolean */
10529 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010530 case Py_LE:
10531 v = TEST_COND(result <= 0);
10532 break;
10533 case Py_GE:
10534 v = TEST_COND(result >= 0);
10535 break;
10536 case Py_LT:
10537 v = TEST_COND(result == -1);
10538 break;
10539 case Py_GT:
10540 v = TEST_COND(result == 1);
10541 break;
10542 default:
10543 PyErr_BadArgument();
10544 return NULL;
10545 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010546 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010547 Py_INCREF(v);
10548 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010549}
10550
Alexander Belopolsky40018472011-02-26 01:02:56 +000010551int
10552PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010553{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010554 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 int kind1, kind2, kind;
10556 void *buf1, *buf2;
10557 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010558 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010559
10560 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010561 sub = PyUnicode_FromObject(element);
10562 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010563 PyErr_Format(PyExc_TypeError,
10564 "'in <string>' requires string as left operand, not %s",
10565 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010566 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010567 }
10568
Thomas Wouters477c8d52006-05-27 19:21:47 +000010569 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010570 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010571 Py_DECREF(sub);
10572 return -1;
10573 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010574 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10575 Py_DECREF(sub);
10576 Py_DECREF(str);
10577 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 kind1 = PyUnicode_KIND(str);
10580 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010581 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 buf1 = PyUnicode_DATA(str);
10583 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010584 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010585 if (kind2 > kind) {
10586 Py_DECREF(sub);
10587 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010588 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010589 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010590 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 if (!buf2) {
10593 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010594 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 return -1;
10596 }
10597 len1 = PyUnicode_GET_LENGTH(str);
10598 len2 = PyUnicode_GET_LENGTH(sub);
10599
Benjamin Petersonead6b532011-12-20 17:23:42 -060010600 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 case PyUnicode_1BYTE_KIND:
10602 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10603 break;
10604 case PyUnicode_2BYTE_KIND:
10605 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10606 break;
10607 case PyUnicode_4BYTE_KIND:
10608 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10609 break;
10610 default:
10611 result = -1;
10612 assert(0);
10613 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614
10615 Py_DECREF(str);
10616 Py_DECREF(sub);
10617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (kind2 != kind)
10619 PyMem_Free(buf2);
10620
Guido van Rossum403d68b2000-03-13 15:55:09 +000010621 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010622}
10623
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624/* Concat to string or Unicode object giving a new Unicode object. */
10625
Alexander Belopolsky40018472011-02-26 01:02:56 +000010626PyObject *
10627PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010630 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010631 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632
10633 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010636 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010639 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640
10641 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010642 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010643 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010646 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649 }
10650
Victor Stinner488fa492011-12-12 00:01:39 +010010651 u_len = PyUnicode_GET_LENGTH(u);
10652 v_len = PyUnicode_GET_LENGTH(v);
10653 if (u_len > PY_SSIZE_T_MAX - v_len) {
10654 PyErr_SetString(PyExc_OverflowError,
10655 "strings are too large to concat");
10656 goto onError;
10657 }
10658 new_len = u_len + v_len;
10659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010661 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010662 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010665 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010668 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10669 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670 Py_DECREF(u);
10671 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010672 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676 Py_XDECREF(u);
10677 Py_XDECREF(v);
10678 return NULL;
10679}
10680
Walter Dörwald1ab83302007-05-18 17:15:44 +000010681void
Victor Stinner23e56682011-10-03 03:54:37 +020010682PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010683{
Victor Stinner23e56682011-10-03 03:54:37 +020010684 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010685 Py_UCS4 maxchar, maxchar2;
10686 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010687
10688 if (p_left == NULL) {
10689 if (!PyErr_Occurred())
10690 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010691 return;
10692 }
Victor Stinner23e56682011-10-03 03:54:37 +020010693 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010694 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010695 if (!PyErr_Occurred())
10696 PyErr_BadInternalCall();
10697 goto error;
10698 }
10699
Benjamin Petersonbac79492012-01-14 13:34:47 -050010700 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010701 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010702 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010703 goto error;
10704
Victor Stinner488fa492011-12-12 00:01:39 +010010705 /* Shortcuts */
10706 if (left == unicode_empty) {
10707 Py_DECREF(left);
10708 Py_INCREF(right);
10709 *p_left = right;
10710 return;
10711 }
10712 if (right == unicode_empty)
10713 return;
10714
10715 left_len = PyUnicode_GET_LENGTH(left);
10716 right_len = PyUnicode_GET_LENGTH(right);
10717 if (left_len > PY_SSIZE_T_MAX - right_len) {
10718 PyErr_SetString(PyExc_OverflowError,
10719 "strings are too large to concat");
10720 goto error;
10721 }
10722 new_len = left_len + right_len;
10723
10724 if (unicode_modifiable(left)
10725 && PyUnicode_CheckExact(right)
10726 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010727 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10728 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010729 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010730 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010731 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10732 {
10733 /* append inplace */
10734 if (unicode_resize(p_left, new_len) != 0) {
10735 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10736 * deallocated so it cannot be put back into
10737 * 'variable'. The MemoryError is raised when there
10738 * is no value in 'variable', which might (very
10739 * remotely) be a cause of incompatibilities.
10740 */
10741 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010742 }
Victor Stinner488fa492011-12-12 00:01:39 +010010743 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010744 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010745 }
Victor Stinner488fa492011-12-12 00:01:39 +010010746 else {
10747 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10748 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010749 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010750
Victor Stinner488fa492011-12-12 00:01:39 +010010751 /* Concat the two Unicode strings */
10752 res = PyUnicode_New(new_len, maxchar);
10753 if (res == NULL)
10754 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010755 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10756 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010757 Py_DECREF(left);
10758 *p_left = res;
10759 }
10760 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010761 return;
10762
10763error:
Victor Stinner488fa492011-12-12 00:01:39 +010010764 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010765}
10766
10767void
10768PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10769{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010770 PyUnicode_Append(pleft, right);
10771 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010772}
10773
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010774PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010775 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010777Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010778string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010779interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780
10781static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010782unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010784 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010785 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010786 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 int kind1, kind2, kind;
10789 void *buf1, *buf2;
10790 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791
Jesus Ceaac451502011-04-20 17:09:23 +020010792 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10793 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010794 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 kind1 = PyUnicode_KIND(self);
10797 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010798 if (kind2 > kind1)
10799 return PyLong_FromLong(0);
10800 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 buf1 = PyUnicode_DATA(self);
10802 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010804 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 if (!buf2) {
10806 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 return NULL;
10808 }
10809 len1 = PyUnicode_GET_LENGTH(self);
10810 len2 = PyUnicode_GET_LENGTH(substring);
10811
10812 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010813 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 case PyUnicode_1BYTE_KIND:
10815 iresult = ucs1lib_count(
10816 ((Py_UCS1*)buf1) + start, end - start,
10817 buf2, len2, PY_SSIZE_T_MAX
10818 );
10819 break;
10820 case PyUnicode_2BYTE_KIND:
10821 iresult = ucs2lib_count(
10822 ((Py_UCS2*)buf1) + start, end - start,
10823 buf2, len2, PY_SSIZE_T_MAX
10824 );
10825 break;
10826 case PyUnicode_4BYTE_KIND:
10827 iresult = ucs4lib_count(
10828 ((Py_UCS4*)buf1) + start, end - start,
10829 buf2, len2, PY_SSIZE_T_MAX
10830 );
10831 break;
10832 default:
10833 assert(0); iresult = 0;
10834 }
10835
10836 result = PyLong_FromSsize_t(iresult);
10837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 if (kind2 != kind)
10839 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840
10841 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010842
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 return result;
10844}
10845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010846PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010847 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010849Encode S using the codec registered for encoding. Default encoding\n\
10850is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010851handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010852a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10853'xmlcharrefreplace' as well as any other name registered with\n\
10854codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855
10856static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010857unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010859 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860 char *encoding = NULL;
10861 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010862
Benjamin Peterson308d6372009-09-18 21:42:35 +000010863 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10864 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010866 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010867}
10868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010869PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010870 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871\n\
10872Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010873If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874
10875static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010876unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010878 Py_ssize_t i, j, line_pos, src_len, incr;
10879 Py_UCS4 ch;
10880 PyObject *u;
10881 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010883 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010884 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885
10886 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888
Antoine Pitrou22425222011-10-04 19:10:51 +020010889 if (PyUnicode_READY(self) == -1)
10890 return NULL;
10891
Thomas Wouters7e474022000-07-16 12:04:32 +000010892 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010893 src_len = PyUnicode_GET_LENGTH(self);
10894 i = j = line_pos = 0;
10895 kind = PyUnicode_KIND(self);
10896 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010897 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010898 for (; i < src_len; i++) {
10899 ch = PyUnicode_READ(kind, src_data, i);
10900 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010901 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010903 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010904 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 goto overflow;
10906 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010907 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010908 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010912 goto overflow;
10913 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010915 if (ch == '\n' || ch == '\r')
10916 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010918 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010919 if (!found)
10920 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010921
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010923 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924 if (!u)
10925 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010926 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
Antoine Pitroue71d5742011-10-04 15:55:09 +020010928 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929
Antoine Pitroue71d5742011-10-04 15:55:09 +020010930 for (; i < src_len; i++) {
10931 ch = PyUnicode_READ(kind, src_data, i);
10932 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010933 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010934 incr = tabsize - (line_pos % tabsize);
10935 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010936 FILL(kind, dest_data, ' ', j, incr);
10937 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010938 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010939 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010941 line_pos++;
10942 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010943 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010944 if (ch == '\n' || ch == '\r')
10945 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010947 }
10948 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010949 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010950
Antoine Pitroue71d5742011-10-04 15:55:09 +020010951 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010952 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10953 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954}
10955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010956PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010957 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958\n\
10959Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010960such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961arguments start and end are interpreted as in slice notation.\n\
10962\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010963Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
10965static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010968 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010969 Py_ssize_t start;
10970 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010971 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972
Jesus Ceaac451502011-04-20 17:09:23 +020010973 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10974 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 if (PyUnicode_READY(self) == -1)
10978 return NULL;
10979 if (PyUnicode_READY(substring) == -1)
10980 return NULL;
10981
Victor Stinner7931d9a2011-11-04 00:22:48 +010010982 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
10984 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 if (result == -2)
10987 return NULL;
10988
Christian Heimes217cfd12007-12-02 14:31:20 +000010989 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990}
10991
10992static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010993unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010995 void *data;
10996 enum PyUnicode_Kind kind;
10997 Py_UCS4 ch;
10998 PyObject *res;
10999
11000 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11001 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011003 }
11004 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11005 PyErr_SetString(PyExc_IndexError, "string index out of range");
11006 return NULL;
11007 }
11008 kind = PyUnicode_KIND(self);
11009 data = PyUnicode_DATA(self);
11010 ch = PyUnicode_READ(kind, data, index);
11011 if (ch < 256)
11012 return get_latin1_char(ch);
11013
11014 res = PyUnicode_New(1, ch);
11015 if (res == NULL)
11016 return NULL;
11017 kind = PyUnicode_KIND(res);
11018 data = PyUnicode_DATA(res);
11019 PyUnicode_WRITE(kind, data, 0, ch);
11020 assert(_PyUnicode_CheckConsistency(res, 1));
11021 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022}
11023
Guido van Rossumc2504932007-09-18 19:42:40 +000011024/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011025 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011026static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011027unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028{
Guido van Rossumc2504932007-09-18 19:42:40 +000011029 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011030 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011031
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011032#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011033 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011034#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 if (_PyUnicode_HASH(self) != -1)
11036 return _PyUnicode_HASH(self);
11037 if (PyUnicode_READY(self) == -1)
11038 return -1;
11039 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011040 /*
11041 We make the hash of the empty string be 0, rather than using
11042 (prefix ^ suffix), since this slightly obfuscates the hash secret
11043 */
11044 if (len == 0) {
11045 _PyUnicode_HASH(self) = 0;
11046 return 0;
11047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048
11049 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011050#define HASH(P) \
11051 x ^= (Py_uhash_t) *P << 7; \
11052 while (--len >= 0) \
11053 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054
Georg Brandl2fb477c2012-02-21 00:33:36 +010011055 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011056 switch (PyUnicode_KIND(self)) {
11057 case PyUnicode_1BYTE_KIND: {
11058 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11059 HASH(c);
11060 break;
11061 }
11062 case PyUnicode_2BYTE_KIND: {
11063 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11064 HASH(s);
11065 break;
11066 }
11067 default: {
11068 Py_UCS4 *l;
11069 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11070 "Impossible switch case in unicode_hash");
11071 l = PyUnicode_4BYTE_DATA(self);
11072 HASH(l);
11073 break;
11074 }
11075 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011076 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11077 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011078
Guido van Rossumc2504932007-09-18 19:42:40 +000011079 if (x == -1)
11080 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011082 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011086PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011087 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011089Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090
11091static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011094 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011095 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011096 Py_ssize_t start;
11097 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098
Jesus Ceaac451502011-04-20 17:09:23 +020011099 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11100 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 if (PyUnicode_READY(self) == -1)
11104 return NULL;
11105 if (PyUnicode_READY(substring) == -1)
11106 return NULL;
11107
Victor Stinner7931d9a2011-11-04 00:22:48 +010011108 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
11110 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 if (result == -2)
11113 return NULL;
11114
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 if (result < 0) {
11116 PyErr_SetString(PyExc_ValueError, "substring not found");
11117 return NULL;
11118 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011119
Christian Heimes217cfd12007-12-02 14:31:20 +000011120 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121}
11122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011123PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011124 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011126Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011127at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128
11129static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011130unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 Py_ssize_t i, length;
11133 int kind;
11134 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135 int cased;
11136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011137 if (PyUnicode_READY(self) == -1)
11138 return NULL;
11139 length = PyUnicode_GET_LENGTH(self);
11140 kind = PyUnicode_KIND(self);
11141 data = PyUnicode_DATA(self);
11142
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 if (length == 1)
11145 return PyBool_FromLong(
11146 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011148 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011151
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 for (i = 0; i < length; i++) {
11154 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011155
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11157 return PyBool_FromLong(0);
11158 else if (!cased && Py_UNICODE_ISLOWER(ch))
11159 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011161 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162}
11163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011164PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011165 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011167Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011168at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169
11170static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011171unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 Py_ssize_t i, length;
11174 int kind;
11175 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176 int cased;
11177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 if (PyUnicode_READY(self) == -1)
11179 return NULL;
11180 length = PyUnicode_GET_LENGTH(self);
11181 kind = PyUnicode_KIND(self);
11182 data = PyUnicode_DATA(self);
11183
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 if (length == 1)
11186 return PyBool_FromLong(
11187 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011189 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011192
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 for (i = 0; i < length; i++) {
11195 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011196
Benjamin Peterson29060642009-01-31 22:14:21 +000011197 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11198 return PyBool_FromLong(0);
11199 else if (!cased && Py_UNICODE_ISUPPER(ch))
11200 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011202 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203}
11204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011205PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011206 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011208Return True if S is a titlecased string and there is at least one\n\
11209character in S, i.e. upper- and titlecase characters may only\n\
11210follow uncased characters and lowercase characters only cased ones.\n\
11211Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
11213static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011214unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 Py_ssize_t i, length;
11217 int kind;
11218 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 int cased, previous_is_cased;
11220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 if (PyUnicode_READY(self) == -1)
11222 return NULL;
11223 length = PyUnicode_GET_LENGTH(self);
11224 kind = PyUnicode_KIND(self);
11225 data = PyUnicode_DATA(self);
11226
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 if (length == 1) {
11229 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11230 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11231 (Py_UNICODE_ISUPPER(ch) != 0));
11232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011234 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011237
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238 cased = 0;
11239 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 for (i = 0; i < length; i++) {
11241 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011242
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11244 if (previous_is_cased)
11245 return PyBool_FromLong(0);
11246 previous_is_cased = 1;
11247 cased = 1;
11248 }
11249 else if (Py_UNICODE_ISLOWER(ch)) {
11250 if (!previous_is_cased)
11251 return PyBool_FromLong(0);
11252 previous_is_cased = 1;
11253 cased = 1;
11254 }
11255 else
11256 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011258 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259}
11260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011261PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011264Return True if all characters in S are whitespace\n\
11265and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266
11267static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011268unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 Py_ssize_t i, length;
11271 int kind;
11272 void *data;
11273
11274 if (PyUnicode_READY(self) == -1)
11275 return NULL;
11276 length = PyUnicode_GET_LENGTH(self);
11277 kind = PyUnicode_KIND(self);
11278 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 if (length == 1)
11282 return PyBool_FromLong(
11283 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011285 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 for (i = 0; i < length; i++) {
11290 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011291 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011294 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295}
11296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011297PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011300Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011301and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011302
11303static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011304unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 Py_ssize_t i, length;
11307 int kind;
11308 void *data;
11309
11310 if (PyUnicode_READY(self) == -1)
11311 return NULL;
11312 length = PyUnicode_GET_LENGTH(self);
11313 kind = PyUnicode_KIND(self);
11314 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011315
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011316 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 if (length == 1)
11318 return PyBool_FromLong(
11319 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011320
11321 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 for (i = 0; i < length; i++) {
11326 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011327 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011328 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011329 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330}
11331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011332PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011333 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011334\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011335Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011336and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011337
11338static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011339unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 int kind;
11342 void *data;
11343 Py_ssize_t len, i;
11344
11345 if (PyUnicode_READY(self) == -1)
11346 return NULL;
11347
11348 kind = PyUnicode_KIND(self);
11349 data = PyUnicode_DATA(self);
11350 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011351
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011352 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 if (len == 1) {
11354 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11355 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11356 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011357
11358 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011360 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 for (i = 0; i < len; i++) {
11363 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011364 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011366 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011367 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011368}
11369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011370PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011373Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011374False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375
11376static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011377unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 Py_ssize_t i, length;
11380 int kind;
11381 void *data;
11382
11383 if (PyUnicode_READY(self) == -1)
11384 return NULL;
11385 length = PyUnicode_GET_LENGTH(self);
11386 kind = PyUnicode_KIND(self);
11387 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 if (length == 1)
11391 return PyBool_FromLong(
11392 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011394 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 for (i = 0; i < length; i++) {
11399 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011402 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403}
11404
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011405PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011406 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011408Return True if all characters in S are digits\n\
11409and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410
11411static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011412unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 Py_ssize_t i, length;
11415 int kind;
11416 void *data;
11417
11418 if (PyUnicode_READY(self) == -1)
11419 return NULL;
11420 length = PyUnicode_GET_LENGTH(self);
11421 kind = PyUnicode_KIND(self);
11422 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 if (length == 1) {
11426 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11427 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011430 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 for (i = 0; i < length; i++) {
11435 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011438 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439}
11440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011441PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011442 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011444Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011445False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
11447static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011448unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 Py_ssize_t i, length;
11451 int kind;
11452 void *data;
11453
11454 if (PyUnicode_READY(self) == -1)
11455 return NULL;
11456 length = PyUnicode_GET_LENGTH(self);
11457 kind = PyUnicode_KIND(self);
11458 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 if (length == 1)
11462 return PyBool_FromLong(
11463 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011465 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 for (i = 0; i < length; i++) {
11470 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011473 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474}
11475
Martin v. Löwis47383402007-08-15 07:32:56 +000011476int
11477PyUnicode_IsIdentifier(PyObject *self)
11478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 int kind;
11480 void *data;
11481 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011482 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 if (PyUnicode_READY(self) == -1) {
11485 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 }
11488
11489 /* Special case for empty strings */
11490 if (PyUnicode_GET_LENGTH(self) == 0)
11491 return 0;
11492 kind = PyUnicode_KIND(self);
11493 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011494
11495 /* PEP 3131 says that the first character must be in
11496 XID_Start and subsequent characters in XID_Continue,
11497 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011498 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011499 letters, digits, underscore). However, given the current
11500 definition of XID_Start and XID_Continue, it is sufficient
11501 to check just for these, except that _ must be allowed
11502 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011504 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011505 return 0;
11506
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011507 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011510 return 1;
11511}
11512
11513PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011515\n\
11516Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011517to the language definition.\n\
11518\n\
11519Use keyword.iskeyword() to test for reserved identifiers\n\
11520such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011521
11522static PyObject*
11523unicode_isidentifier(PyObject *self)
11524{
11525 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11526}
11527
Georg Brandl559e5d72008-06-11 18:37:52 +000011528PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011530\n\
11531Return True if all characters in S are considered\n\
11532printable in repr() or S is empty, False otherwise.");
11533
11534static PyObject*
11535unicode_isprintable(PyObject *self)
11536{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 Py_ssize_t i, length;
11538 int kind;
11539 void *data;
11540
11541 if (PyUnicode_READY(self) == -1)
11542 return NULL;
11543 length = PyUnicode_GET_LENGTH(self);
11544 kind = PyUnicode_KIND(self);
11545 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011546
11547 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 if (length == 1)
11549 return PyBool_FromLong(
11550 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 for (i = 0; i < length; i++) {
11553 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011554 Py_RETURN_FALSE;
11555 }
11556 }
11557 Py_RETURN_TRUE;
11558}
11559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011560PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011561 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562\n\
11563Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011564iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565
11566static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011567unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011569 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570}
11571
Martin v. Löwis18e16552006-02-15 17:27:45 +000011572static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011573unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 if (PyUnicode_READY(self) == -1)
11576 return -1;
11577 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578}
11579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011580PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011583Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011584done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
11586static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011587unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011589 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 Py_UCS4 fillchar = ' ';
11591
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011592 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593 return NULL;
11594
Benjamin Petersonbac79492012-01-14 13:34:47 -050011595 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597
Victor Stinnerc4b49542011-12-11 22:44:26 +010011598 if (PyUnicode_GET_LENGTH(self) >= width)
11599 return unicode_result_unchanged(self);
11600
11601 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602}
11603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011604PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011607Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608
11609static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011610unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011612 if (PyUnicode_READY(self) == -1)
11613 return NULL;
11614 if (PyUnicode_IS_ASCII(self))
11615 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011616 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617}
11618
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11627
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011628/* externally visible for str.strip(unicode) */
11629PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011630_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 void *data;
11633 int kind;
11634 Py_ssize_t i, j, len;
11635 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11638 return NULL;
11639
11640 kind = PyUnicode_KIND(self);
11641 data = PyUnicode_DATA(self);
11642 len = PyUnicode_GET_LENGTH(self);
11643 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11644 PyUnicode_DATA(sepobj),
11645 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011646
Benjamin Peterson14339b62009-01-31 16:36:08 +000011647 i = 0;
11648 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 while (i < len &&
11650 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 i++;
11652 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011653 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011654
Benjamin Peterson14339b62009-01-31 16:36:08 +000011655 j = len;
11656 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 do {
11658 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 } while (j >= i &&
11660 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011662 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011663
Victor Stinner7931d9a2011-11-04 00:22:48 +010011664 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665}
11666
11667PyObject*
11668PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11669{
11670 unsigned char *data;
11671 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011672 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673
Victor Stinnerde636f32011-10-01 03:55:54 +020011674 if (PyUnicode_READY(self) == -1)
11675 return NULL;
11676
Victor Stinner684d5fd2012-05-03 02:32:34 +020011677 length = PyUnicode_GET_LENGTH(self);
11678 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011679
Victor Stinner684d5fd2012-05-03 02:32:34 +020011680 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011681 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682
Victor Stinnerde636f32011-10-01 03:55:54 +020011683 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011684 PyErr_SetString(PyExc_IndexError, "string index out of range");
11685 return NULL;
11686 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011687 if (start >= length || end < start)
11688 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011689
Victor Stinner684d5fd2012-05-03 02:32:34 +020011690 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011691 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011692 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011693 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011694 }
11695 else {
11696 kind = PyUnicode_KIND(self);
11697 data = PyUnicode_1BYTE_DATA(self);
11698 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011699 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011700 length);
11701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
11704static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011705do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 int kind;
11708 void *data;
11709 Py_ssize_t len, i, j;
11710
11711 if (PyUnicode_READY(self) == -1)
11712 return NULL;
11713
11714 kind = PyUnicode_KIND(self);
11715 data = PyUnicode_DATA(self);
11716 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011717
Benjamin Peterson14339b62009-01-31 16:36:08 +000011718 i = 0;
11719 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011721 i++;
11722 }
11723 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011724
Benjamin Peterson14339b62009-01-31 16:36:08 +000011725 j = len;
11726 if (striptype != LEFTSTRIP) {
11727 do {
11728 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011730 j++;
11731 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011732
Victor Stinner7931d9a2011-11-04 00:22:48 +010011733 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734}
11735
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011736
11737static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011738do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011739{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011740 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011741
Benjamin Peterson14339b62009-01-31 16:36:08 +000011742 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11743 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011744
Benjamin Peterson14339b62009-01-31 16:36:08 +000011745 if (sep != NULL && sep != Py_None) {
11746 if (PyUnicode_Check(sep))
11747 return _PyUnicode_XStrip(self, striptype, sep);
11748 else {
11749 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 "%s arg must be None or str",
11751 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011752 return NULL;
11753 }
11754 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755
Benjamin Peterson14339b62009-01-31 16:36:08 +000011756 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757}
11758
11759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011760PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011762\n\
11763Return a copy of the string S with leading and trailing\n\
11764whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011765If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011766
11767static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011768unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011769{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011770 if (PyTuple_GET_SIZE(args) == 0)
11771 return do_strip(self, BOTHSTRIP); /* Common case */
11772 else
11773 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011774}
11775
11776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011777PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779\n\
11780Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011781If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782
11783static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011784unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011785{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011786 if (PyTuple_GET_SIZE(args) == 0)
11787 return do_strip(self, LEFTSTRIP); /* Common case */
11788 else
11789 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011790}
11791
11792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011793PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011795\n\
11796Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011797If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011798
11799static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011800unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011801{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011802 if (PyTuple_GET_SIZE(args) == 0)
11803 return do_strip(self, RIGHTSTRIP); /* Common case */
11804 else
11805 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011806}
11807
11808
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011810unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011812 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
Serhiy Storchaka05997252013-01-26 12:14:02 +020011815 if (len < 1)
11816 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817
Victor Stinnerc4b49542011-12-11 22:44:26 +010011818 /* no repeat, return original string */
11819 if (len == 1)
11820 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011821
Benjamin Petersonbac79492012-01-14 13:34:47 -050011822 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 return NULL;
11824
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011825 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011826 PyErr_SetString(PyExc_OverflowError,
11827 "repeated string is too long");
11828 return NULL;
11829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011831
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011832 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833 if (!u)
11834 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011835 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (PyUnicode_GET_LENGTH(str) == 1) {
11838 const int kind = PyUnicode_KIND(str);
11839 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011840 if (kind == PyUnicode_1BYTE_KIND) {
11841 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011842 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011843 }
11844 else if (kind == PyUnicode_2BYTE_KIND) {
11845 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011846 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011847 ucs2[n] = fill_char;
11848 } else {
11849 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11850 assert(kind == PyUnicode_4BYTE_KIND);
11851 for (n = 0; n < len; ++n)
11852 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 }
11855 else {
11856 /* number of characters copied this far */
11857 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011858 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 char *to = (char *) PyUnicode_DATA(u);
11860 Py_MEMCPY(to, PyUnicode_DATA(str),
11861 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 n = (done <= nchars-done) ? done : nchars-done;
11864 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011865 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867 }
11868
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011869 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011870 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871}
11872
Alexander Belopolsky40018472011-02-26 01:02:56 +000011873PyObject *
11874PyUnicode_Replace(PyObject *obj,
11875 PyObject *subobj,
11876 PyObject *replobj,
11877 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878{
11879 PyObject *self;
11880 PyObject *str1;
11881 PyObject *str2;
11882 PyObject *result;
11883
11884 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011885 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011888 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 Py_DECREF(self);
11890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 }
11892 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011893 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011894 Py_DECREF(self);
11895 Py_DECREF(str1);
11896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011898 if (PyUnicode_READY(self) == -1 ||
11899 PyUnicode_READY(str1) == -1 ||
11900 PyUnicode_READY(str2) == -1)
11901 result = NULL;
11902 else
11903 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 Py_DECREF(self);
11905 Py_DECREF(str1);
11906 Py_DECREF(str2);
11907 return result;
11908}
11909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011910PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011911 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912\n\
11913Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011914old replaced by new. If the optional argument count is\n\
11915given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916
11917static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 PyObject *str1;
11921 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011922 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 PyObject *result;
11924
Martin v. Löwis18e16552006-02-15 17:27:45 +000011925 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011927 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011930 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 return NULL;
11932 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011933 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 Py_DECREF(str1);
11935 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011936 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011937 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11938 result = NULL;
11939 else
11940 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941
11942 Py_DECREF(str1);
11943 Py_DECREF(str2);
11944 return result;
11945}
11946
Alexander Belopolsky40018472011-02-26 01:02:56 +000011947static PyObject *
11948unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011950 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 Py_ssize_t isize;
11952 Py_ssize_t osize, squote, dquote, i, o;
11953 Py_UCS4 max, quote;
11954 int ikind, okind;
11955 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011958 return NULL;
11959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 isize = PyUnicode_GET_LENGTH(unicode);
11961 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 /* Compute length of output, quote characters, and
11964 maximum character */
11965 osize = 2; /* quotes */
11966 max = 127;
11967 squote = dquote = 0;
11968 ikind = PyUnicode_KIND(unicode);
11969 for (i = 0; i < isize; i++) {
11970 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11971 switch (ch) {
11972 case '\'': squote++; osize++; break;
11973 case '"': dquote++; osize++; break;
11974 case '\\': case '\t': case '\r': case '\n':
11975 osize += 2; break;
11976 default:
11977 /* Fast-path ASCII */
11978 if (ch < ' ' || ch == 0x7f)
11979 osize += 4; /* \xHH */
11980 else if (ch < 0x7f)
11981 osize++;
11982 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11983 osize++;
11984 max = ch > max ? ch : max;
11985 }
11986 else if (ch < 0x100)
11987 osize += 4; /* \xHH */
11988 else if (ch < 0x10000)
11989 osize += 6; /* \uHHHH */
11990 else
11991 osize += 10; /* \uHHHHHHHH */
11992 }
11993 }
11994
11995 quote = '\'';
11996 if (squote) {
11997 if (dquote)
11998 /* Both squote and dquote present. Use squote,
11999 and escape them */
12000 osize += squote;
12001 else
12002 quote = '"';
12003 }
12004
12005 repr = PyUnicode_New(osize, max);
12006 if (repr == NULL)
12007 return NULL;
12008 okind = PyUnicode_KIND(repr);
12009 odata = PyUnicode_DATA(repr);
12010
12011 PyUnicode_WRITE(okind, odata, 0, quote);
12012 PyUnicode_WRITE(okind, odata, osize-1, quote);
12013
12014 for (i = 0, o = 1; i < isize; i++) {
12015 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012016
12017 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 if ((ch == quote) || (ch == '\\')) {
12019 PyUnicode_WRITE(okind, odata, o++, '\\');
12020 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012021 continue;
12022 }
12023
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012025 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 PyUnicode_WRITE(okind, odata, o++, '\\');
12027 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012028 }
12029 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 PyUnicode_WRITE(okind, odata, o++, '\\');
12031 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012032 }
12033 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 PyUnicode_WRITE(okind, odata, o++, '\\');
12035 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012036 }
12037
12038 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012039 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 PyUnicode_WRITE(okind, odata, o++, '\\');
12041 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012042 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12043 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012044 }
12045
Georg Brandl559e5d72008-06-11 18:37:52 +000012046 /* Copy ASCII characters as-is */
12047 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012049 }
12050
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012052 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012053 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012054 (categories Z* and C* except ASCII space)
12055 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012057 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012058 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012063 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012064 /* Map 16-bit characters to '\uxxxx' */
12065 else if (ch <= 0xffff) {
12066 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012067 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12068 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012071 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012072 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012073 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012074 PyUnicode_WRITE(okind, odata, o++, 'U');
12075 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12076 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12077 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12078 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012079 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12080 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12081 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012083 }
12084 }
12085 /* Copy characters as-is */
12086 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012088 }
12089 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012092 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012093 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094}
12095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012096PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012097 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098\n\
12099Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012100such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101arguments start and end are interpreted as in slice notation.\n\
12102\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012103Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104
12105static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012108 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012109 Py_ssize_t start;
12110 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012111 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
Jesus Ceaac451502011-04-20 17:09:23 +020012113 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12114 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 if (PyUnicode_READY(self) == -1)
12118 return NULL;
12119 if (PyUnicode_READY(substring) == -1)
12120 return NULL;
12121
Victor Stinner7931d9a2011-11-04 00:22:48 +010012122 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
12124 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 if (result == -2)
12127 return NULL;
12128
Christian Heimes217cfd12007-12-02 14:31:20 +000012129 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130}
12131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012132PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012133 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012135Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136
12137static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012140 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012141 Py_ssize_t start;
12142 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012143 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144
Jesus Ceaac451502011-04-20 17:09:23 +020012145 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12146 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (PyUnicode_READY(self) == -1)
12150 return NULL;
12151 if (PyUnicode_READY(substring) == -1)
12152 return NULL;
12153
Victor Stinner7931d9a2011-11-04 00:22:48 +010012154 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155
12156 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 if (result == -2)
12159 return NULL;
12160
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161 if (result < 0) {
12162 PyErr_SetString(PyExc_ValueError, "substring not found");
12163 return NULL;
12164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165
Christian Heimes217cfd12007-12-02 14:31:20 +000012166 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167}
12168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012169PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012172Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012173done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
12175static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012176unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012178 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 Py_UCS4 fillchar = ' ';
12180
Victor Stinnere9a29352011-10-01 02:14:59 +020012181 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012183
Benjamin Petersonbac79492012-01-14 13:34:47 -050012184 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185 return NULL;
12186
Victor Stinnerc4b49542011-12-11 22:44:26 +010012187 if (PyUnicode_GET_LENGTH(self) >= width)
12188 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
Victor Stinnerc4b49542011-12-11 22:44:26 +010012190 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191}
12192
Alexander Belopolsky40018472011-02-26 01:02:56 +000012193PyObject *
12194PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195{
12196 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012197
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198 s = PyUnicode_FromObject(s);
12199 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012200 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012201 if (sep != NULL) {
12202 sep = PyUnicode_FromObject(sep);
12203 if (sep == NULL) {
12204 Py_DECREF(s);
12205 return NULL;
12206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207 }
12208
Victor Stinner9310abb2011-10-05 00:59:23 +020012209 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210
12211 Py_DECREF(s);
12212 Py_XDECREF(sep);
12213 return result;
12214}
12215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012216PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012217 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218\n\
12219Return a list of the words in S, using sep as the\n\
12220delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012221splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012222whitespace string is a separator and empty strings are\n\
12223removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
12225static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012226unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012228 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012230 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012232 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12233 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 return NULL;
12235
12236 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012237 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012239 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012241 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242}
12243
Thomas Wouters477c8d52006-05-27 19:21:47 +000012244PyObject *
12245PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12246{
12247 PyObject* str_obj;
12248 PyObject* sep_obj;
12249 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 int kind1, kind2, kind;
12251 void *buf1 = NULL, *buf2 = NULL;
12252 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012253
12254 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012255 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012256 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012257 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012258 if (!sep_obj) {
12259 Py_DECREF(str_obj);
12260 return NULL;
12261 }
12262 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12263 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012264 Py_DECREF(str_obj);
12265 return NULL;
12266 }
12267
Victor Stinner14f8f022011-10-05 20:58:25 +020012268 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012270 kind = Py_MAX(kind1, kind2);
12271 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012273 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 if (!buf1)
12275 goto onError;
12276 buf2 = PyUnicode_DATA(sep_obj);
12277 if (kind2 != kind)
12278 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12279 if (!buf2)
12280 goto onError;
12281 len1 = PyUnicode_GET_LENGTH(str_obj);
12282 len2 = PyUnicode_GET_LENGTH(sep_obj);
12283
Benjamin Petersonead6b532011-12-20 17:23:42 -060012284 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012286 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12287 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12288 else
12289 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 break;
12291 case PyUnicode_2BYTE_KIND:
12292 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12293 break;
12294 case PyUnicode_4BYTE_KIND:
12295 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12296 break;
12297 default:
12298 assert(0);
12299 out = 0;
12300 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012301
12302 Py_DECREF(sep_obj);
12303 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 if (kind1 != kind)
12305 PyMem_Free(buf1);
12306 if (kind2 != kind)
12307 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012308
12309 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 onError:
12311 Py_DECREF(sep_obj);
12312 Py_DECREF(str_obj);
12313 if (kind1 != kind && buf1)
12314 PyMem_Free(buf1);
12315 if (kind2 != kind && buf2)
12316 PyMem_Free(buf2);
12317 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012318}
12319
12320
12321PyObject *
12322PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12323{
12324 PyObject* str_obj;
12325 PyObject* sep_obj;
12326 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 int kind1, kind2, kind;
12328 void *buf1 = NULL, *buf2 = NULL;
12329 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012330
12331 str_obj = PyUnicode_FromObject(str_in);
12332 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012333 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012334 sep_obj = PyUnicode_FromObject(sep_in);
12335 if (!sep_obj) {
12336 Py_DECREF(str_obj);
12337 return NULL;
12338 }
12339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 kind1 = PyUnicode_KIND(str_in);
12341 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012342 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 buf1 = PyUnicode_DATA(str_in);
12344 if (kind1 != kind)
12345 buf1 = _PyUnicode_AsKind(str_in, kind);
12346 if (!buf1)
12347 goto onError;
12348 buf2 = PyUnicode_DATA(sep_obj);
12349 if (kind2 != kind)
12350 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12351 if (!buf2)
12352 goto onError;
12353 len1 = PyUnicode_GET_LENGTH(str_obj);
12354 len2 = PyUnicode_GET_LENGTH(sep_obj);
12355
Benjamin Petersonead6b532011-12-20 17:23:42 -060012356 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012358 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12359 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12360 else
12361 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 break;
12363 case PyUnicode_2BYTE_KIND:
12364 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12365 break;
12366 case PyUnicode_4BYTE_KIND:
12367 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12368 break;
12369 default:
12370 assert(0);
12371 out = 0;
12372 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012373
12374 Py_DECREF(sep_obj);
12375 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 if (kind1 != kind)
12377 PyMem_Free(buf1);
12378 if (kind2 != kind)
12379 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380
12381 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 onError:
12383 Py_DECREF(sep_obj);
12384 Py_DECREF(str_obj);
12385 if (kind1 != kind && buf1)
12386 PyMem_Free(buf1);
12387 if (kind2 != kind && buf2)
12388 PyMem_Free(buf2);
12389 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012390}
12391
12392PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012393 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012394\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012395Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012396the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012397found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012398
12399static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012400unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012401{
Victor Stinner9310abb2011-10-05 00:59:23 +020012402 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012403}
12404
12405PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012406 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012407\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012408Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012409the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012410separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012411
12412static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012413unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012414{
Victor Stinner9310abb2011-10-05 00:59:23 +020012415 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012416}
12417
Alexander Belopolsky40018472011-02-26 01:02:56 +000012418PyObject *
12419PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012420{
12421 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012423 s = PyUnicode_FromObject(s);
12424 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012425 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 if (sep != NULL) {
12427 sep = PyUnicode_FromObject(sep);
12428 if (sep == NULL) {
12429 Py_DECREF(s);
12430 return NULL;
12431 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012432 }
12433
Victor Stinner9310abb2011-10-05 00:59:23 +020012434 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012435
12436 Py_DECREF(s);
12437 Py_XDECREF(sep);
12438 return result;
12439}
12440
12441PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012442 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012443\n\
12444Return a list of the words in S, using sep as the\n\
12445delimiter string, starting at the end of the string and\n\
12446working to the front. If maxsplit is given, at most maxsplit\n\
12447splits are done. If sep is not specified, any whitespace string\n\
12448is a separator.");
12449
12450static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012451unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012452{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012453 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012454 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012455 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012456
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012457 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12458 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012459 return NULL;
12460
12461 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012462 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012463 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012464 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012465 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012466 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012467}
12468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012469PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471\n\
12472Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012473Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012474is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475
12476static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012477unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012479 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012480 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012482 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12483 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484 return NULL;
12485
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012486 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487}
12488
12489static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012490PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012492 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493}
12494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012495PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012496 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497\n\
12498Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012499and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
12501static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012502unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012504 if (PyUnicode_READY(self) == -1)
12505 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012506 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507}
12508
Georg Brandlceee0772007-11-27 23:48:05 +000012509PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012511\n\
12512Return a translation table usable for str.translate().\n\
12513If there is only one argument, it must be a dictionary mapping Unicode\n\
12514ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012515Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012516If there are two arguments, they must be strings of equal length, and\n\
12517in the resulting dictionary, each character in x will be mapped to the\n\
12518character at the same position in y. If there is a third argument, it\n\
12519must be a string, whose characters will be mapped to None in the result.");
12520
12521static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012522unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012523{
12524 PyObject *x, *y = NULL, *z = NULL;
12525 PyObject *new = NULL, *key, *value;
12526 Py_ssize_t i = 0;
12527 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012528
Georg Brandlceee0772007-11-27 23:48:05 +000012529 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12530 return NULL;
12531 new = PyDict_New();
12532 if (!new)
12533 return NULL;
12534 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 int x_kind, y_kind, z_kind;
12536 void *x_data, *y_data, *z_data;
12537
Georg Brandlceee0772007-11-27 23:48:05 +000012538 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012539 if (!PyUnicode_Check(x)) {
12540 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12541 "be a string if there is a second argument");
12542 goto err;
12543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012545 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12546 "arguments must have equal length");
12547 goto err;
12548 }
12549 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 x_kind = PyUnicode_KIND(x);
12551 y_kind = PyUnicode_KIND(y);
12552 x_data = PyUnicode_DATA(x);
12553 y_data = PyUnicode_DATA(y);
12554 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12555 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012556 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012557 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012558 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012559 if (!value) {
12560 Py_DECREF(key);
12561 goto err;
12562 }
Georg Brandlceee0772007-11-27 23:48:05 +000012563 res = PyDict_SetItem(new, key, value);
12564 Py_DECREF(key);
12565 Py_DECREF(value);
12566 if (res < 0)
12567 goto err;
12568 }
12569 /* create entries for deleting chars in z */
12570 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 z_kind = PyUnicode_KIND(z);
12572 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012573 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012575 if (!key)
12576 goto err;
12577 res = PyDict_SetItem(new, key, Py_None);
12578 Py_DECREF(key);
12579 if (res < 0)
12580 goto err;
12581 }
12582 }
12583 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 int kind;
12585 void *data;
12586
Georg Brandlceee0772007-11-27 23:48:05 +000012587 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012588 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012589 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12590 "to maketrans it must be a dict");
12591 goto err;
12592 }
12593 /* copy entries into the new dict, converting string keys to int keys */
12594 while (PyDict_Next(x, &i, &key, &value)) {
12595 if (PyUnicode_Check(key)) {
12596 /* convert string keys to integer keys */
12597 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012598 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012599 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12600 "table must be of length 1");
12601 goto err;
12602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 kind = PyUnicode_KIND(key);
12604 data = PyUnicode_DATA(key);
12605 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012606 if (!newkey)
12607 goto err;
12608 res = PyDict_SetItem(new, newkey, value);
12609 Py_DECREF(newkey);
12610 if (res < 0)
12611 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012612 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012613 /* just keep integer keys */
12614 if (PyDict_SetItem(new, key, value) < 0)
12615 goto err;
12616 } else {
12617 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12618 "be strings or integers");
12619 goto err;
12620 }
12621 }
12622 }
12623 return new;
12624 err:
12625 Py_DECREF(new);
12626 return NULL;
12627}
12628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012629PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012630 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631\n\
12632Return a copy of the string S, where all characters have been mapped\n\
12633through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012634Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012635Unmapped characters are left untouched. Characters mapped to None\n\
12636are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637
12638static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642}
12643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012644PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012647Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648
12649static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012650unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012652 if (PyUnicode_READY(self) == -1)
12653 return NULL;
12654 if (PyUnicode_IS_ASCII(self))
12655 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012656 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657}
12658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012659PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012662Pad a numeric string S with zeros on the left, to fill a field\n\
12663of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664
12665static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012666unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012668 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012669 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012670 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 int kind;
12672 void *data;
12673 Py_UCS4 chr;
12674
Martin v. Löwis18e16552006-02-15 17:27:45 +000012675 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676 return NULL;
12677
Benjamin Petersonbac79492012-01-14 13:34:47 -050012678 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012679 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680
Victor Stinnerc4b49542011-12-11 22:44:26 +010012681 if (PyUnicode_GET_LENGTH(self) >= width)
12682 return unicode_result_unchanged(self);
12683
12684 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685
12686 u = pad(self, fill, 0, '0');
12687
Walter Dörwald068325e2002-04-15 13:36:47 +000012688 if (u == NULL)
12689 return NULL;
12690
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 kind = PyUnicode_KIND(u);
12692 data = PyUnicode_DATA(u);
12693 chr = PyUnicode_READ(kind, data, fill);
12694
12695 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 PyUnicode_WRITE(kind, data, 0, chr);
12698 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699 }
12700
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012701 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012702 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704
#if 0
/* Compiled out.  Would forward to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012713PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012716Return True if S starts with the specified prefix, False otherwise.\n\
12717With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012718With optional end, stop comparing S at that position.\n\
12719prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720
12721static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012722unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012723 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012725 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012726 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012727 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012728 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012729 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730
Jesus Ceaac451502011-04-20 17:09:23 +020012731 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012733 if (PyTuple_Check(subobj)) {
12734 Py_ssize_t i;
12735 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012736 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012737 if (substring == NULL)
12738 return NULL;
12739 result = tailmatch(self, substring, start, end, -1);
12740 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012741 if (result == -1)
12742 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012743 if (result) {
12744 Py_RETURN_TRUE;
12745 }
12746 }
12747 /* nothing matched */
12748 Py_RETURN_FALSE;
12749 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012750 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012751 if (substring == NULL) {
12752 if (PyErr_ExceptionMatches(PyExc_TypeError))
12753 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12754 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012756 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012757 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012759 if (result == -1)
12760 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012761 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762}
12763
12764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012765PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012768Return True if S ends with the specified suffix, False otherwise.\n\
12769With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012770With optional end, stop comparing S at that position.\n\
12771suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772
12773static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012774unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012775 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012777 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012778 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012779 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012780 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012781 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782
Jesus Ceaac451502011-04-20 17:09:23 +020012783 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012785 if (PyTuple_Check(subobj)) {
12786 Py_ssize_t i;
12787 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012788 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012790 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012791 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012792 result = tailmatch(self, substring, start, end, +1);
12793 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012794 if (result == -1)
12795 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012796 if (result) {
12797 Py_RETURN_TRUE;
12798 }
12799 }
12800 Py_RETURN_FALSE;
12801 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012802 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012803 if (substring == NULL) {
12804 if (PyErr_ExceptionMatches(PyExc_TypeError))
12805 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12806 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012808 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012809 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010012810 if (result == -1)
12811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012813 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814}
12815
Victor Stinner202fdca2012-05-07 12:47:02 +020012816Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012817_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012818{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012819 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012820 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12821 writer->data = PyUnicode_DATA(writer->buffer);
12822 writer->kind = PyUnicode_KIND(writer->buffer);
12823}
12824
Victor Stinnerd3f08822012-05-29 12:57:52 +020012825void
12826_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012827{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012828 memset(writer, 0, sizeof(*writer));
12829#ifdef Py_DEBUG
12830 writer->kind = 5; /* invalid kind */
12831#endif
12832 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012833 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012834}
12835
Victor Stinnerd3f08822012-05-29 12:57:52 +020012836int
12837_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12838 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012839{
12840 Py_ssize_t newlen;
12841 PyObject *newbuffer;
12842
Victor Stinnerd3f08822012-05-29 12:57:52 +020012843 assert(length > 0);
12844
Victor Stinner202fdca2012-05-07 12:47:02 +020012845 if (length > PY_SSIZE_T_MAX - writer->pos) {
12846 PyErr_NoMemory();
12847 return -1;
12848 }
12849 newlen = writer->pos + length;
12850
Victor Stinnerd3f08822012-05-29 12:57:52 +020012851 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012852 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012853 /* overallocate 25% to limit the number of resize */
12854 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12855 newlen += newlen / 4;
12856 if (newlen < writer->min_length)
12857 newlen = writer->min_length;
12858 }
12859 writer->buffer = PyUnicode_New(newlen, maxchar);
12860 if (writer->buffer == NULL)
12861 return -1;
12862 _PyUnicodeWriter_Update(writer);
12863 return 0;
12864 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012865
Victor Stinnerd3f08822012-05-29 12:57:52 +020012866 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012867 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012868 /* overallocate 25% to limit the number of resize */
12869 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12870 newlen += newlen / 4;
12871 if (newlen < writer->min_length)
12872 newlen = writer->min_length;
12873 }
12874
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012875 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012876 /* resize + widen */
12877 newbuffer = PyUnicode_New(newlen, maxchar);
12878 if (newbuffer == NULL)
12879 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012880 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12881 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012882 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012883 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012884 }
12885 else {
12886 newbuffer = resize_compact(writer->buffer, newlen);
12887 if (newbuffer == NULL)
12888 return -1;
12889 }
12890 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012891 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012892 }
12893 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012894 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012895 newbuffer = PyUnicode_New(writer->size, maxchar);
12896 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012897 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012898 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12899 writer->buffer, 0, writer->pos);
12900 Py_DECREF(writer->buffer);
12901 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012902 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012903 }
12904 return 0;
12905}
12906
Victor Stinnerd3f08822012-05-29 12:57:52 +020012907int
12908_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12909{
12910 Py_UCS4 maxchar;
12911 Py_ssize_t len;
12912
12913 if (PyUnicode_READY(str) == -1)
12914 return -1;
12915 len = PyUnicode_GET_LENGTH(str);
12916 if (len == 0)
12917 return 0;
12918 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12919 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012920 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012921 Py_INCREF(str);
12922 writer->buffer = str;
12923 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012924 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012925 writer->size = 0;
12926 writer->pos += len;
12927 return 0;
12928 }
12929 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12930 return -1;
12931 }
12932 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12933 str, 0, len);
12934 writer->pos += len;
12935 return 0;
12936}
12937
Victor Stinnere215d962012-10-06 23:03:36 +020012938int
Victor Stinnercfc4c132013-04-03 01:48:39 +020012939_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
12940 Py_ssize_t start, Py_ssize_t end)
12941{
12942 Py_UCS4 maxchar;
12943 Py_ssize_t len;
12944
12945 if (PyUnicode_READY(str) == -1)
12946 return -1;
12947
12948 assert(0 <= start);
12949 assert(end <= PyUnicode_GET_LENGTH(str));
12950 assert(start <= end);
12951
12952 if (end == 0)
12953 return 0;
12954
12955 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
12956 return _PyUnicodeWriter_WriteStr(writer, str);
12957
12958 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
12959 maxchar = _PyUnicode_FindMaxChar(str, start, end);
12960 else
12961 maxchar = writer->maxchar;
12962 len = end - start;
12963
12964 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
12965 return -1;
12966
12967 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12968 str, start, len);
12969 writer->pos += len;
12970 return 0;
12971}
12972
12973int
Victor Stinnere215d962012-10-06 23:03:36 +020012974_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12975{
12976 Py_UCS4 maxchar;
12977
12978 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12979 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12980 return -1;
12981 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12982 writer->pos += len;
12983 return 0;
12984}
12985
Victor Stinnerd3f08822012-05-29 12:57:52 +020012986PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012987_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012988{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012989 if (writer->pos == 0) {
12990 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012991 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012992 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012993 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012994 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12995 return writer->buffer;
12996 }
12997 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12998 PyObject *newbuffer;
12999 newbuffer = resize_compact(writer->buffer, writer->pos);
13000 if (newbuffer == NULL) {
13001 Py_DECREF(writer->buffer);
13002 return NULL;
13003 }
13004 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013005 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020013006 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010013007 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013008}
13009
Victor Stinnerd3f08822012-05-29 12:57:52 +020013010void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013011_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013012{
13013 Py_CLEAR(writer->buffer);
13014}
13015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013017
13018PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013019 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013020\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013021Return a formatted version of S, using substitutions from args and kwargs.\n\
13022The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013023
Eric Smith27bbca62010-11-04 17:06:58 +000013024PyDoc_STRVAR(format_map__doc__,
13025 "S.format_map(mapping) -> str\n\
13026\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013027Return a formatted version of S, using substitutions from mapping.\n\
13028The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013029
Eric Smith4a7d76d2008-05-30 18:10:19 +000013030static PyObject *
13031unicode__format__(PyObject* self, PyObject* args)
13032{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013033 PyObject *format_spec;
13034 _PyUnicodeWriter writer;
13035 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013036
13037 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13038 return NULL;
13039
Victor Stinnerd3f08822012-05-29 12:57:52 +020013040 if (PyUnicode_READY(self) == -1)
13041 return NULL;
13042 _PyUnicodeWriter_Init(&writer, 0);
13043 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13044 self, format_spec, 0,
13045 PyUnicode_GET_LENGTH(format_spec));
13046 if (ret == -1) {
13047 _PyUnicodeWriter_Dealloc(&writer);
13048 return NULL;
13049 }
13050 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013051}
13052
Eric Smith8c663262007-08-25 02:26:07 +000013053PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013054 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013055\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013056Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013057
13058static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013059unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013060{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 Py_ssize_t size;
13062
13063 /* If it's a compact object, account for base structure +
13064 character data. */
13065 if (PyUnicode_IS_COMPACT_ASCII(v))
13066 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13067 else if (PyUnicode_IS_COMPACT(v))
13068 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013069 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 else {
13071 /* If it is a two-block object, account for base object, and
13072 for character block if present. */
13073 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013074 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013075 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013076 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 }
13078 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013079 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013080 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013082 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013083 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013084
13085 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013086}
13087
13088PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013089 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013090
13091static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013092unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013093{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013094 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013095 if (!copy)
13096 return NULL;
13097 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013098}
13099
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013101 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013102 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013103 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13104 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013105 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13106 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013107 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013108 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13109 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13110 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13111 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13112 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013113 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013114 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13115 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13116 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013117 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013118 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13119 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13120 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013121 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013122 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013123 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013124 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013125 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13126 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13127 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13128 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13129 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13130 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13131 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13132 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13133 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13134 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13135 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13136 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13137 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13138 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013139 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013140 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013141 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013142 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013143 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013144 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013145 {"maketrans", (PyCFunction) unicode_maketrans,
13146 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013147 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013148#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013149 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013150 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151#endif
13152
Benjamin Peterson14339b62009-01-31 16:36:08 +000013153 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154 {NULL, NULL}
13155};
13156
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013157static PyObject *
13158unicode_mod(PyObject *v, PyObject *w)
13159{
Brian Curtindfc80e32011-08-10 20:28:54 -050013160 if (!PyUnicode_Check(v))
13161 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013162 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013163}
13164
13165static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 0, /*nb_add*/
13167 0, /*nb_subtract*/
13168 0, /*nb_multiply*/
13169 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013170};
13171
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 (lenfunc) unicode_length, /* sq_length */
13174 PyUnicode_Concat, /* sq_concat */
13175 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13176 (ssizeargfunc) unicode_getitem, /* sq_item */
13177 0, /* sq_slice */
13178 0, /* sq_ass_item */
13179 0, /* sq_ass_slice */
13180 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181};
13182
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013183static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013184unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013185{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013186 if (PyUnicode_READY(self) == -1)
13187 return NULL;
13188
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013189 if (PyIndex_Check(item)) {
13190 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013191 if (i == -1 && PyErr_Occurred())
13192 return NULL;
13193 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013195 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013196 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013197 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013198 PyObject *result;
13199 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013200 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013201 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013204 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013205 return NULL;
13206 }
13207
13208 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013209 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013210 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013211 slicelength == PyUnicode_GET_LENGTH(self)) {
13212 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013213 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013214 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013215 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013216 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013217 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013218 src_kind = PyUnicode_KIND(self);
13219 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013220 if (!PyUnicode_IS_ASCII(self)) {
13221 kind_limit = kind_maxchar_limit(src_kind);
13222 max_char = 0;
13223 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13224 ch = PyUnicode_READ(src_kind, src_data, cur);
13225 if (ch > max_char) {
13226 max_char = ch;
13227 if (max_char >= kind_limit)
13228 break;
13229 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013230 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013231 }
Victor Stinner55c99112011-10-13 01:17:06 +020013232 else
13233 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013234 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013235 if (result == NULL)
13236 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013237 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013238 dest_data = PyUnicode_DATA(result);
13239
13240 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013241 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13242 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013243 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013244 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013245 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013246 } else {
13247 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13248 return NULL;
13249 }
13250}
13251
13252static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013253 (lenfunc)unicode_length, /* mp_length */
13254 (binaryfunc)unicode_subscript, /* mp_subscript */
13255 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013256};
13257
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258
Guido van Rossumd57fd912000-03-10 22:53:23 +000013259/* Helpers for PyUnicode_Format() */
13260
Victor Stinnera47082312012-10-04 02:19:54 +020013261struct unicode_formatter_t {
13262 PyObject *args;
13263 int args_owned;
13264 Py_ssize_t arglen, argidx;
13265 PyObject *dict;
13266
13267 enum PyUnicode_Kind fmtkind;
13268 Py_ssize_t fmtcnt, fmtpos;
13269 void *fmtdata;
13270 PyObject *fmtstr;
13271
13272 _PyUnicodeWriter writer;
13273};
13274
13275struct unicode_format_arg_t {
13276 Py_UCS4 ch;
13277 int flags;
13278 Py_ssize_t width;
13279 int prec;
13280 int sign;
13281};
13282
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013284unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285{
Victor Stinnera47082312012-10-04 02:19:54 +020013286 Py_ssize_t argidx = ctx->argidx;
13287
13288 if (argidx < ctx->arglen) {
13289 ctx->argidx++;
13290 if (ctx->arglen < 0)
13291 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 else
Victor Stinnera47082312012-10-04 02:19:54 +020013293 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294 }
13295 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013296 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297 return NULL;
13298}
13299
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013300/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301
Victor Stinnera47082312012-10-04 02:19:54 +020013302/* Format a float into the writer if the writer is not NULL, or into *p_output
13303 otherwise.
13304
13305 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013306static int
Victor Stinnera47082312012-10-04 02:19:54 +020013307formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13308 PyObject **p_output,
13309 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013311 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013313 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013314 int prec;
13315 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013316
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317 x = PyFloat_AsDouble(v);
13318 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013319 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013320
Victor Stinnera47082312012-10-04 02:19:54 +020013321 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013322 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013323 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013324
Victor Stinnera47082312012-10-04 02:19:54 +020013325 if (arg->flags & F_ALT)
13326 dtoa_flags = Py_DTSF_ALT;
13327 else
13328 dtoa_flags = 0;
13329 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013330 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013331 return -1;
13332 len = strlen(p);
13333 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013334 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13335 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013336 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013337 }
Victor Stinner184252a2012-06-16 02:57:41 +020013338 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013339 writer->pos += len;
13340 }
13341 else
13342 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013343 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013344 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345}
13346
Victor Stinnerd0880d52012-04-27 23:40:13 +020013347/* formatlong() emulates the format codes d, u, o, x and X, and
13348 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13349 * Python's regular ints.
13350 * Return value: a new PyUnicodeObject*, or NULL if error.
13351 * The output string is of the form
13352 * "-"? ("0x" | "0X")? digit+
13353 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13354 * set in flags. The case of hex digits will be correct,
13355 * There will be at least prec digits, zero-filled on the left if
13356 * necessary to get that many.
13357 * val object to be converted
13358 * flags bitmask of format flags; only F_ALT is looked at
13359 * prec minimum number of digits; 0-fill on left if needed
13360 * type a character in [duoxX]; u acts the same as d
13361 *
13362 * CAUTION: o, x and X conversions on regular ints can never
13363 * produce a '-' sign, but can for Python's unbounded ints.
13364 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013365static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013366formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013367{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013368 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013369 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013370 Py_ssize_t i;
13371 int sign; /* 1 if '-', else 0 */
13372 int len; /* number of characters */
13373 Py_ssize_t llen;
13374 int numdigits; /* len == numnondigits + numdigits */
13375 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013376 int prec = arg->prec;
13377 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013378
Victor Stinnerd0880d52012-04-27 23:40:13 +020013379 /* Avoid exceeding SSIZE_T_MAX */
13380 if (prec > INT_MAX-3) {
13381 PyErr_SetString(PyExc_OverflowError,
13382 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013383 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013384 }
13385
13386 assert(PyLong_Check(val));
13387
13388 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013389 default:
13390 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013391 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013392 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013393 case 'u':
13394 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013395 if (PyBool_Check(val))
13396 result = PyNumber_ToBase(val, 10);
13397 else
13398 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013399 break;
13400 case 'o':
13401 numnondigits = 2;
13402 result = PyNumber_ToBase(val, 8);
13403 break;
13404 case 'x':
13405 case 'X':
13406 numnondigits = 2;
13407 result = PyNumber_ToBase(val, 16);
13408 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013409 }
13410 if (!result)
13411 return NULL;
13412
13413 assert(unicode_modifiable(result));
13414 assert(PyUnicode_IS_READY(result));
13415 assert(PyUnicode_IS_ASCII(result));
13416
13417 /* To modify the string in-place, there can only be one reference. */
13418 if (Py_REFCNT(result) != 1) {
13419 PyErr_BadInternalCall();
13420 return NULL;
13421 }
13422 buf = PyUnicode_DATA(result);
13423 llen = PyUnicode_GET_LENGTH(result);
13424 if (llen > INT_MAX) {
13425 PyErr_SetString(PyExc_ValueError,
13426 "string too large in _PyBytes_FormatLong");
13427 return NULL;
13428 }
13429 len = (int)llen;
13430 sign = buf[0] == '-';
13431 numnondigits += sign;
13432 numdigits = len - numnondigits;
13433 assert(numdigits > 0);
13434
13435 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013436 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013437 (type == 'o' || type == 'x' || type == 'X'))) {
13438 assert(buf[sign] == '0');
13439 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13440 buf[sign+1] == 'o');
13441 numnondigits -= 2;
13442 buf += 2;
13443 len -= 2;
13444 if (sign)
13445 buf[0] = '-';
13446 assert(len == numnondigits + numdigits);
13447 assert(numdigits > 0);
13448 }
13449
13450 /* Fill with leading zeroes to meet minimum width. */
13451 if (prec > numdigits) {
13452 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13453 numnondigits + prec);
13454 char *b1;
13455 if (!r1) {
13456 Py_DECREF(result);
13457 return NULL;
13458 }
13459 b1 = PyBytes_AS_STRING(r1);
13460 for (i = 0; i < numnondigits; ++i)
13461 *b1++ = *buf++;
13462 for (i = 0; i < prec - numdigits; i++)
13463 *b1++ = '0';
13464 for (i = 0; i < numdigits; i++)
13465 *b1++ = *buf++;
13466 *b1 = '\0';
13467 Py_DECREF(result);
13468 result = r1;
13469 buf = PyBytes_AS_STRING(result);
13470 len = numnondigits + prec;
13471 }
13472
13473 /* Fix up case for hex conversions. */
13474 if (type == 'X') {
13475 /* Need to convert all lower case letters to upper case.
13476 and need to convert 0x to 0X (and -0x to -0X). */
13477 for (i = 0; i < len; i++)
13478 if (buf[i] >= 'a' && buf[i] <= 'x')
13479 buf[i] -= 'a'-'A';
13480 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013481 if (!PyUnicode_Check(result)
13482 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013483 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013484 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013485 Py_DECREF(result);
13486 result = unicode;
13487 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013488 else if (len != PyUnicode_GET_LENGTH(result)) {
13489 if (PyUnicode_Resize(&result, len) < 0)
13490 Py_CLEAR(result);
13491 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013492 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013493}
13494
Victor Stinner621ef3d2012-10-02 00:33:47 +020013495/* Format an integer.
13496 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013497 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013498 * -1 and raise an exception on error */
13499static int
Victor Stinnera47082312012-10-04 02:19:54 +020013500mainformatlong(PyObject *v,
13501 struct unicode_format_arg_t *arg,
13502 PyObject **p_output,
13503 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013504{
13505 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013506 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013507
13508 if (!PyNumber_Check(v))
13509 goto wrongtype;
13510
13511 if (!PyLong_Check(v)) {
13512 iobj = PyNumber_Long(v);
13513 if (iobj == NULL) {
13514 if (PyErr_ExceptionMatches(PyExc_TypeError))
13515 goto wrongtype;
13516 return -1;
13517 }
13518 assert(PyLong_Check(iobj));
13519 }
13520 else {
13521 iobj = v;
13522 Py_INCREF(iobj);
13523 }
13524
13525 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013526 && arg->width == -1 && arg->prec == -1
13527 && !(arg->flags & (F_SIGN | F_BLANK))
13528 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013529 {
13530 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013531 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013532 int base;
13533
Victor Stinnera47082312012-10-04 02:19:54 +020013534 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013535 {
13536 default:
13537 assert(0 && "'type' not in [diuoxX]");
13538 case 'd':
13539 case 'i':
13540 case 'u':
13541 base = 10;
13542 break;
13543 case 'o':
13544 base = 8;
13545 break;
13546 case 'x':
13547 case 'X':
13548 base = 16;
13549 break;
13550 }
13551
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013552 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13553 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013554 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013555 }
13556 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013557 return 1;
13558 }
13559
Victor Stinnera47082312012-10-04 02:19:54 +020013560 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013561 Py_DECREF(iobj);
13562 if (res == NULL)
13563 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013564 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013565 return 0;
13566
13567wrongtype:
13568 PyErr_Format(PyExc_TypeError,
13569 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013570 "not %.200s",
13571 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013572 return -1;
13573}
13574
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013575static Py_UCS4
13576formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013577{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013578 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013579 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013580 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013581 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 goto onError;
13584 }
13585 else {
13586 /* Integer input truncated to a character */
13587 long x;
13588 x = PyLong_AsLong(v);
13589 if (x == -1 && PyErr_Occurred())
13590 goto onError;
13591
Victor Stinner8faf8212011-12-08 22:14:11 +010013592 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013593 PyErr_SetString(PyExc_OverflowError,
13594 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013595 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 }
13597
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013598 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013599 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013600
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013602 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013604 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013605}
13606
Victor Stinnera47082312012-10-04 02:19:54 +020013607/* Parse options of an argument: flags, width, precision.
13608 Handle also "%(name)" syntax.
13609
13610 Return 0 if the argument has been formatted into arg->str.
13611 Return 1 if the argument has been written into ctx->writer,
13612 Raise an exception and return -1 on error. */
13613static int
13614unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13615 struct unicode_format_arg_t *arg)
13616{
13617#define FORMAT_READ(ctx) \
13618 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13619
13620 PyObject *v;
13621
Victor Stinnera47082312012-10-04 02:19:54 +020013622 if (arg->ch == '(') {
13623 /* Get argument value from a dictionary. Example: "%(name)s". */
13624 Py_ssize_t keystart;
13625 Py_ssize_t keylen;
13626 PyObject *key;
13627 int pcount = 1;
13628
13629 if (ctx->dict == NULL) {
13630 PyErr_SetString(PyExc_TypeError,
13631 "format requires a mapping");
13632 return -1;
13633 }
13634 ++ctx->fmtpos;
13635 --ctx->fmtcnt;
13636 keystart = ctx->fmtpos;
13637 /* Skip over balanced parentheses */
13638 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13639 arg->ch = FORMAT_READ(ctx);
13640 if (arg->ch == ')')
13641 --pcount;
13642 else if (arg->ch == '(')
13643 ++pcount;
13644 ctx->fmtpos++;
13645 }
13646 keylen = ctx->fmtpos - keystart - 1;
13647 if (ctx->fmtcnt < 0 || pcount > 0) {
13648 PyErr_SetString(PyExc_ValueError,
13649 "incomplete format key");
13650 return -1;
13651 }
13652 key = PyUnicode_Substring(ctx->fmtstr,
13653 keystart, keystart + keylen);
13654 if (key == NULL)
13655 return -1;
13656 if (ctx->args_owned) {
13657 Py_DECREF(ctx->args);
13658 ctx->args_owned = 0;
13659 }
13660 ctx->args = PyObject_GetItem(ctx->dict, key);
13661 Py_DECREF(key);
13662 if (ctx->args == NULL)
13663 return -1;
13664 ctx->args_owned = 1;
13665 ctx->arglen = -1;
13666 ctx->argidx = -2;
13667 }
13668
13669 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013670 while (--ctx->fmtcnt >= 0) {
13671 arg->ch = FORMAT_READ(ctx);
13672 ctx->fmtpos++;
13673 switch (arg->ch) {
13674 case '-': arg->flags |= F_LJUST; continue;
13675 case '+': arg->flags |= F_SIGN; continue;
13676 case ' ': arg->flags |= F_BLANK; continue;
13677 case '#': arg->flags |= F_ALT; continue;
13678 case '0': arg->flags |= F_ZERO; continue;
13679 }
13680 break;
13681 }
13682
13683 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013684 if (arg->ch == '*') {
13685 v = unicode_format_getnextarg(ctx);
13686 if (v == NULL)
13687 return -1;
13688 if (!PyLong_Check(v)) {
13689 PyErr_SetString(PyExc_TypeError,
13690 "* wants int");
13691 return -1;
13692 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013693 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013694 if (arg->width == -1 && PyErr_Occurred())
13695 return -1;
13696 if (arg->width < 0) {
13697 arg->flags |= F_LJUST;
13698 arg->width = -arg->width;
13699 }
13700 if (--ctx->fmtcnt >= 0) {
13701 arg->ch = FORMAT_READ(ctx);
13702 ctx->fmtpos++;
13703 }
13704 }
13705 else if (arg->ch >= '0' && arg->ch <= '9') {
13706 arg->width = arg->ch - '0';
13707 while (--ctx->fmtcnt >= 0) {
13708 arg->ch = FORMAT_READ(ctx);
13709 ctx->fmtpos++;
13710 if (arg->ch < '0' || arg->ch > '9')
13711 break;
13712 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13713 mixing signed and unsigned comparison. Since arg->ch is between
13714 '0' and '9', casting to int is safe. */
13715 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13716 PyErr_SetString(PyExc_ValueError,
13717 "width too big");
13718 return -1;
13719 }
13720 arg->width = arg->width*10 + (arg->ch - '0');
13721 }
13722 }
13723
13724 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013725 if (arg->ch == '.') {
13726 arg->prec = 0;
13727 if (--ctx->fmtcnt >= 0) {
13728 arg->ch = FORMAT_READ(ctx);
13729 ctx->fmtpos++;
13730 }
13731 if (arg->ch == '*') {
13732 v = unicode_format_getnextarg(ctx);
13733 if (v == NULL)
13734 return -1;
13735 if (!PyLong_Check(v)) {
13736 PyErr_SetString(PyExc_TypeError,
13737 "* wants int");
13738 return -1;
13739 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013740 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013741 if (arg->prec == -1 && PyErr_Occurred())
13742 return -1;
13743 if (arg->prec < 0)
13744 arg->prec = 0;
13745 if (--ctx->fmtcnt >= 0) {
13746 arg->ch = FORMAT_READ(ctx);
13747 ctx->fmtpos++;
13748 }
13749 }
13750 else if (arg->ch >= '0' && arg->ch <= '9') {
13751 arg->prec = arg->ch - '0';
13752 while (--ctx->fmtcnt >= 0) {
13753 arg->ch = FORMAT_READ(ctx);
13754 ctx->fmtpos++;
13755 if (arg->ch < '0' || arg->ch > '9')
13756 break;
13757 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13758 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013759 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013760 return -1;
13761 }
13762 arg->prec = arg->prec*10 + (arg->ch - '0');
13763 }
13764 }
13765 }
13766
13767 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13768 if (ctx->fmtcnt >= 0) {
13769 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13770 if (--ctx->fmtcnt >= 0) {
13771 arg->ch = FORMAT_READ(ctx);
13772 ctx->fmtpos++;
13773 }
13774 }
13775 }
13776 if (ctx->fmtcnt < 0) {
13777 PyErr_SetString(PyExc_ValueError,
13778 "incomplete format");
13779 return -1;
13780 }
13781 return 0;
13782
13783#undef FORMAT_READ
13784}
13785
13786/* Format one argument. Supported conversion specifiers:
13787
13788 - "s", "r", "a": any type
13789 - "i", "d", "u", "o", "x", "X": int
13790 - "e", "E", "f", "F", "g", "G": float
13791 - "c": int or str (1 character)
13792
Victor Stinner8dbd4212012-12-04 09:30:24 +010013793 When possible, the output is written directly into the Unicode writer
13794 (ctx->writer). A string is created when padding is required.
13795
Victor Stinnera47082312012-10-04 02:19:54 +020013796 Return 0 if the argument has been formatted into *p_str,
13797 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013798 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013799static int
13800unicode_format_arg_format(struct unicode_formatter_t *ctx,
13801 struct unicode_format_arg_t *arg,
13802 PyObject **p_str)
13803{
13804 PyObject *v;
13805 _PyUnicodeWriter *writer = &ctx->writer;
13806
13807 if (ctx->fmtcnt == 0)
13808 ctx->writer.overallocate = 0;
13809
13810 if (arg->ch == '%') {
13811 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13812 return -1;
13813 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13814 writer->pos += 1;
13815 return 1;
13816 }
13817
13818 v = unicode_format_getnextarg(ctx);
13819 if (v == NULL)
13820 return -1;
13821
Victor Stinnera47082312012-10-04 02:19:54 +020013822
13823 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013824 case 's':
13825 case 'r':
13826 case 'a':
13827 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13828 /* Fast path */
13829 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13830 return -1;
13831 return 1;
13832 }
13833
13834 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13835 *p_str = v;
13836 Py_INCREF(*p_str);
13837 }
13838 else {
13839 if (arg->ch == 's')
13840 *p_str = PyObject_Str(v);
13841 else if (arg->ch == 'r')
13842 *p_str = PyObject_Repr(v);
13843 else
13844 *p_str = PyObject_ASCII(v);
13845 }
13846 break;
13847
13848 case 'i':
13849 case 'd':
13850 case 'u':
13851 case 'o':
13852 case 'x':
13853 case 'X':
13854 {
13855 int ret = mainformatlong(v, arg, p_str, writer);
13856 if (ret != 0)
13857 return ret;
13858 arg->sign = 1;
13859 break;
13860 }
13861
13862 case 'e':
13863 case 'E':
13864 case 'f':
13865 case 'F':
13866 case 'g':
13867 case 'G':
13868 if (arg->width == -1 && arg->prec == -1
13869 && !(arg->flags & (F_SIGN | F_BLANK)))
13870 {
13871 /* Fast path */
13872 if (formatfloat(v, arg, NULL, writer) == -1)
13873 return -1;
13874 return 1;
13875 }
13876
13877 arg->sign = 1;
13878 if (formatfloat(v, arg, p_str, NULL) == -1)
13879 return -1;
13880 break;
13881
13882 case 'c':
13883 {
13884 Py_UCS4 ch = formatchar(v);
13885 if (ch == (Py_UCS4) -1)
13886 return -1;
13887 if (arg->width == -1 && arg->prec == -1) {
13888 /* Fast path */
13889 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13890 return -1;
13891 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13892 writer->pos += 1;
13893 return 1;
13894 }
13895 *p_str = PyUnicode_FromOrdinal(ch);
13896 break;
13897 }
13898
13899 default:
13900 PyErr_Format(PyExc_ValueError,
13901 "unsupported format character '%c' (0x%x) "
13902 "at index %zd",
13903 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13904 (int)arg->ch,
13905 ctx->fmtpos - 1);
13906 return -1;
13907 }
13908 if (*p_str == NULL)
13909 return -1;
13910 assert (PyUnicode_Check(*p_str));
13911 return 0;
13912}
13913
13914static int
13915unicode_format_arg_output(struct unicode_formatter_t *ctx,
13916 struct unicode_format_arg_t *arg,
13917 PyObject *str)
13918{
13919 Py_ssize_t len;
13920 enum PyUnicode_Kind kind;
13921 void *pbuf;
13922 Py_ssize_t pindex;
13923 Py_UCS4 signchar;
13924 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013925 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020013926 Py_ssize_t sublen;
13927 _PyUnicodeWriter *writer = &ctx->writer;
13928 Py_UCS4 fill;
13929
13930 fill = ' ';
13931 if (arg->sign && arg->flags & F_ZERO)
13932 fill = '0';
13933
13934 if (PyUnicode_READY(str) == -1)
13935 return -1;
13936
13937 len = PyUnicode_GET_LENGTH(str);
13938 if ((arg->width == -1 || arg->width <= len)
13939 && (arg->prec == -1 || arg->prec >= len)
13940 && !(arg->flags & (F_SIGN | F_BLANK)))
13941 {
13942 /* Fast path */
13943 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13944 return -1;
13945 return 0;
13946 }
13947
13948 /* Truncate the string for "s", "r" and "a" formats
13949 if the precision is set */
13950 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13951 if (arg->prec >= 0 && len > arg->prec)
13952 len = arg->prec;
13953 }
13954
13955 /* Adjust sign and width */
13956 kind = PyUnicode_KIND(str);
13957 pbuf = PyUnicode_DATA(str);
13958 pindex = 0;
13959 signchar = '\0';
13960 if (arg->sign) {
13961 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13962 if (ch == '-' || ch == '+') {
13963 signchar = ch;
13964 len--;
13965 pindex++;
13966 }
13967 else if (arg->flags & F_SIGN)
13968 signchar = '+';
13969 else if (arg->flags & F_BLANK)
13970 signchar = ' ';
13971 else
13972 arg->sign = 0;
13973 }
13974 if (arg->width < len)
13975 arg->width = len;
13976
13977 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013978 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020013979 if (!(arg->flags & F_LJUST)) {
13980 if (arg->sign) {
13981 if ((arg->width-1) > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013982 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020013983 }
13984 else {
13985 if (arg->width > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013986 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020013987 }
13988 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013989 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
13990 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13991 maxchar = MAX_MAXCHAR(maxchar, strmaxchar);
13992 }
13993
Victor Stinnera47082312012-10-04 02:19:54 +020013994 buflen = arg->width;
13995 if (arg->sign && len == arg->width)
13996 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013997 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020013998 return -1;
13999
14000 /* Write the sign if needed */
14001 if (arg->sign) {
14002 if (fill != ' ') {
14003 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14004 writer->pos += 1;
14005 }
14006 if (arg->width > len)
14007 arg->width--;
14008 }
14009
14010 /* Write the numeric prefix for "x", "X" and "o" formats
14011 if the alternate form is used.
14012 For example, write "0x" for the "%#x" format. */
14013 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14014 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14015 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14016 if (fill != ' ') {
14017 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14018 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14019 writer->pos += 2;
14020 pindex += 2;
14021 }
14022 arg->width -= 2;
14023 if (arg->width < 0)
14024 arg->width = 0;
14025 len -= 2;
14026 }
14027
14028 /* Pad left with the fill character if needed */
14029 if (arg->width > len && !(arg->flags & F_LJUST)) {
14030 sublen = arg->width - len;
14031 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14032 writer->pos += sublen;
14033 arg->width = len;
14034 }
14035
14036 /* If padding with spaces: write sign if needed and/or numeric prefix if
14037 the alternate form is used */
14038 if (fill == ' ') {
14039 if (arg->sign) {
14040 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14041 writer->pos += 1;
14042 }
14043 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14044 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14045 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14046 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14047 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14048 writer->pos += 2;
14049 pindex += 2;
14050 }
14051 }
14052
14053 /* Write characters */
14054 if (len) {
14055 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14056 str, pindex, len);
14057 writer->pos += len;
14058 }
14059
14060 /* Pad right with the fill character if needed */
14061 if (arg->width > len) {
14062 sublen = arg->width - len;
14063 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14064 writer->pos += sublen;
14065 }
14066 return 0;
14067}
14068
14069/* Helper of PyUnicode_Format(): format one arg.
14070 Return 0 on success, raise an exception and return -1 on error. */
14071static int
14072unicode_format_arg(struct unicode_formatter_t *ctx)
14073{
14074 struct unicode_format_arg_t arg;
14075 PyObject *str;
14076 int ret;
14077
Victor Stinner8dbd4212012-12-04 09:30:24 +010014078 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14079 arg.flags = 0;
14080 arg.width = -1;
14081 arg.prec = -1;
14082 arg.sign = 0;
14083 str = NULL;
14084
Victor Stinnera47082312012-10-04 02:19:54 +020014085 ret = unicode_format_arg_parse(ctx, &arg);
14086 if (ret == -1)
14087 return -1;
14088
14089 ret = unicode_format_arg_format(ctx, &arg, &str);
14090 if (ret == -1)
14091 return -1;
14092
14093 if (ret != 1) {
14094 ret = unicode_format_arg_output(ctx, &arg, str);
14095 Py_DECREF(str);
14096 if (ret == -1)
14097 return -1;
14098 }
14099
14100 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14101 PyErr_SetString(PyExc_TypeError,
14102 "not all arguments converted during string formatting");
14103 return -1;
14104 }
14105 return 0;
14106}
14107
Alexander Belopolsky40018472011-02-26 01:02:56 +000014108PyObject *
14109PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014110{
Victor Stinnera47082312012-10-04 02:19:54 +020014111 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014112
Guido van Rossumd57fd912000-03-10 22:53:23 +000014113 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014114 PyErr_BadInternalCall();
14115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014116 }
Victor Stinnera47082312012-10-04 02:19:54 +020014117
14118 ctx.fmtstr = PyUnicode_FromObject(format);
14119 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014120 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014121 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14122 Py_DECREF(ctx.fmtstr);
14123 return NULL;
14124 }
14125 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14126 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14127 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14128 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014129
Victor Stinnera47082312012-10-04 02:19:54 +020014130 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014131
Guido van Rossumd57fd912000-03-10 22:53:23 +000014132 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014133 ctx.arglen = PyTuple_Size(args);
14134 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014135 }
14136 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014137 ctx.arglen = -1;
14138 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014139 }
Victor Stinnera47082312012-10-04 02:19:54 +020014140 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014141 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014142 ctx.dict = args;
14143 else
14144 ctx.dict = NULL;
14145 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014146
Victor Stinnera47082312012-10-04 02:19:54 +020014147 while (--ctx.fmtcnt >= 0) {
14148 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014149 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014150
14151 nonfmtpos = ctx.fmtpos++;
14152 while (ctx.fmtcnt >= 0 &&
14153 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14154 ctx.fmtpos++;
14155 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014156 }
Victor Stinnera47082312012-10-04 02:19:54 +020014157 if (ctx.fmtcnt < 0) {
14158 ctx.fmtpos--;
14159 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014160 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014161
Victor Stinnercfc4c132013-04-03 01:48:39 +020014162 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14163 nonfmtpos, ctx.fmtpos) < 0)
14164 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014165 }
14166 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014167 ctx.fmtpos++;
14168 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014169 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014170 }
14171 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014172
Victor Stinnera47082312012-10-04 02:19:54 +020014173 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014174 PyErr_SetString(PyExc_TypeError,
14175 "not all arguments converted during string formatting");
14176 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014177 }
14178
Victor Stinnera47082312012-10-04 02:19:54 +020014179 if (ctx.args_owned) {
14180 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014181 }
Victor Stinnera47082312012-10-04 02:19:54 +020014182 Py_DECREF(ctx.fmtstr);
14183 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014184
Benjamin Peterson29060642009-01-31 22:14:21 +000014185 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014186 Py_DECREF(ctx.fmtstr);
14187 _PyUnicodeWriter_Dealloc(&ctx.writer);
14188 if (ctx.args_owned) {
14189 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014190 }
14191 return NULL;
14192}
14193
Jeremy Hylton938ace62002-07-17 16:30:39 +000014194static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014195unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14196
Tim Peters6d6c1a32001-08-02 04:15:00 +000014197static PyObject *
14198unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14199{
Benjamin Peterson29060642009-01-31 22:14:21 +000014200 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014201 static char *kwlist[] = {"object", "encoding", "errors", 0};
14202 char *encoding = NULL;
14203 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014204
Benjamin Peterson14339b62009-01-31 16:36:08 +000014205 if (type != &PyUnicode_Type)
14206 return unicode_subtype_new(type, args, kwds);
14207 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014208 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014209 return NULL;
14210 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014211 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014212 if (encoding == NULL && errors == NULL)
14213 return PyObject_Str(x);
14214 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014215 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014216}
14217
Guido van Rossume023fe02001-08-30 03:12:59 +000014218static PyObject *
14219unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14220{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014221 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014222 Py_ssize_t length, char_size;
14223 int share_wstr, share_utf8;
14224 unsigned int kind;
14225 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014226
Benjamin Peterson14339b62009-01-31 16:36:08 +000014227 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014228
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014229 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014230 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014231 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014232 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014233 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014234 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014235 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014236 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014237
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014238 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014239 if (self == NULL) {
14240 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014241 return NULL;
14242 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014243 kind = PyUnicode_KIND(unicode);
14244 length = PyUnicode_GET_LENGTH(unicode);
14245
14246 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014247#ifdef Py_DEBUG
14248 _PyUnicode_HASH(self) = -1;
14249#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014250 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014251#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014252 _PyUnicode_STATE(self).interned = 0;
14253 _PyUnicode_STATE(self).kind = kind;
14254 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014255 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014256 _PyUnicode_STATE(self).ready = 1;
14257 _PyUnicode_WSTR(self) = NULL;
14258 _PyUnicode_UTF8_LENGTH(self) = 0;
14259 _PyUnicode_UTF8(self) = NULL;
14260 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014261 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014262
14263 share_utf8 = 0;
14264 share_wstr = 0;
14265 if (kind == PyUnicode_1BYTE_KIND) {
14266 char_size = 1;
14267 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14268 share_utf8 = 1;
14269 }
14270 else if (kind == PyUnicode_2BYTE_KIND) {
14271 char_size = 2;
14272 if (sizeof(wchar_t) == 2)
14273 share_wstr = 1;
14274 }
14275 else {
14276 assert(kind == PyUnicode_4BYTE_KIND);
14277 char_size = 4;
14278 if (sizeof(wchar_t) == 4)
14279 share_wstr = 1;
14280 }
14281
14282 /* Ensure we won't overflow the length. */
14283 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14284 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014285 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014287 data = PyObject_MALLOC((length + 1) * char_size);
14288 if (data == NULL) {
14289 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014290 goto onError;
14291 }
14292
Victor Stinnerc3c74152011-10-02 20:39:55 +020014293 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014294 if (share_utf8) {
14295 _PyUnicode_UTF8_LENGTH(self) = length;
14296 _PyUnicode_UTF8(self) = data;
14297 }
14298 if (share_wstr) {
14299 _PyUnicode_WSTR_LENGTH(self) = length;
14300 _PyUnicode_WSTR(self) = (wchar_t *)data;
14301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014302
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014303 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014304 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014305 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014306#ifdef Py_DEBUG
14307 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14308#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014309 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014310 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014311
14312onError:
14313 Py_DECREF(unicode);
14314 Py_DECREF(self);
14315 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014316}
14317
/* Docstring of the str type (used as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014329
/* Defined further down in the file; needed here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Slots left at 0 are not provided by this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14375
14376/* Initialize the Unicode implementation */
14377
Victor Stinner3a50e702011-10-18 21:21:00 +020014378int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014379{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014380 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014381 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014382 0x000A, /* LINE FEED */
14383 0x000D, /* CARRIAGE RETURN */
14384 0x001C, /* FILE SEPARATOR */
14385 0x001D, /* GROUP SEPARATOR */
14386 0x001E, /* RECORD SEPARATOR */
14387 0x0085, /* NEXT LINE */
14388 0x2028, /* LINE SEPARATOR */
14389 0x2029, /* PARAGRAPH SEPARATOR */
14390 };
14391
Fred Drakee4315f52000-05-09 19:53:39 +000014392 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014393 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014394 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014395 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014396 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014397
Guido van Rossumcacfc072002-05-24 19:01:59 +000014398 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014399 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014400
14401 /* initialize the linebreak bloom filter */
14402 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014403 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014404 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014405
14406 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014407
Benjamin Petersonc4311282012-10-30 23:21:10 -040014408 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14409 Py_FatalError("Can't initialize field name iterator type");
14410
14411 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14412 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014413
Victor Stinner3a50e702011-10-18 21:21:00 +020014414#ifdef HAVE_MBCS
14415 winver.dwOSVersionInfoSize = sizeof(winver);
14416 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14417 PyErr_SetFromWindowsErr(0);
14418 return -1;
14419 }
14420#endif
14421 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014422}
14423
14424/* Finalize the Unicode implementation */
14425
/* Does no work in this implementation: always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14431
Guido van Rossumd57fd912000-03-10 22:53:23 +000014432void
Thomas Wouters78890102000-07-22 19:25:51 +000014433_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014434{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014435 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014436
Serhiy Storchaka05997252013-01-26 12:14:02 +020014437 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014438
Serhiy Storchaka05997252013-01-26 12:14:02 +020014439 for (i = 0; i < 256; i++)
14440 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014441 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014442 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014443}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014444
Walter Dörwald16807132007-05-25 13:52:07 +000014445void
14446PyUnicode_InternInPlace(PyObject **p)
14447{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014448 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014449 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014450#ifdef Py_DEBUG
14451 assert(s != NULL);
14452 assert(_PyUnicode_CHECK(s));
14453#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014454 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014455 return;
14456#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014457 /* If it's a subclass, we don't really know what putting
14458 it in the interned dict might do. */
14459 if (!PyUnicode_CheckExact(s))
14460 return;
14461 if (PyUnicode_CHECK_INTERNED(s))
14462 return;
14463 if (interned == NULL) {
14464 interned = PyDict_New();
14465 if (interned == NULL) {
14466 PyErr_Clear(); /* Don't leave an exception */
14467 return;
14468 }
14469 }
14470 /* It might be that the GetItem call fails even
14471 though the key is present in the dictionary,
14472 namely when this happens during a stack overflow. */
14473 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014474 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014475 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014476
Benjamin Peterson29060642009-01-31 22:14:21 +000014477 if (t) {
14478 Py_INCREF(t);
14479 Py_DECREF(*p);
14480 *p = t;
14481 return;
14482 }
Walter Dörwald16807132007-05-25 13:52:07 +000014483
Benjamin Peterson14339b62009-01-31 16:36:08 +000014484 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014485 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014486 PyErr_Clear();
14487 PyThreadState_GET()->recursion_critical = 0;
14488 return;
14489 }
14490 PyThreadState_GET()->recursion_critical = 0;
14491 /* The two references in interned are not counted by refcnt.
14492 The deallocator will take care of this */
14493 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014494 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014495}
14496
14497void
14498PyUnicode_InternImmortal(PyObject **p)
14499{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014500 PyUnicode_InternInPlace(p);
14501 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014502 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014503 Py_INCREF(*p);
14504 }
Walter Dörwald16807132007-05-25 13:52:07 +000014505}
14506
14507PyObject *
14508PyUnicode_InternFromString(const char *cp)
14509{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014510 PyObject *s = PyUnicode_FromString(cp);
14511 if (s == NULL)
14512 return NULL;
14513 PyUnicode_InternInPlace(&s);
14514 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014515}
14516
Alexander Belopolsky40018472011-02-26 01:02:56 +000014517void
14518_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014519{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014520 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014521 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014522 Py_ssize_t i, n;
14523 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014524
Benjamin Peterson14339b62009-01-31 16:36:08 +000014525 if (interned == NULL || !PyDict_Check(interned))
14526 return;
14527 keys = PyDict_Keys(interned);
14528 if (keys == NULL || !PyList_Check(keys)) {
14529 PyErr_Clear();
14530 return;
14531 }
Walter Dörwald16807132007-05-25 13:52:07 +000014532
Benjamin Peterson14339b62009-01-31 16:36:08 +000014533 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14534 detector, interned unicode strings are not forcibly deallocated;
14535 rather, we give them their stolen references back, and then clear
14536 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014537
Benjamin Peterson14339b62009-01-31 16:36:08 +000014538 n = PyList_GET_SIZE(keys);
14539 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014540 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014541 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014542 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014543 if (PyUnicode_READY(s) == -1) {
14544 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014545 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014547 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014548 case SSTATE_NOT_INTERNED:
14549 /* XXX Shouldn't happen */
14550 break;
14551 case SSTATE_INTERNED_IMMORTAL:
14552 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014553 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014554 break;
14555 case SSTATE_INTERNED_MORTAL:
14556 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014557 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014558 break;
14559 default:
14560 Py_FatalError("Inconsistent interned string state.");
14561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014562 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014563 }
14564 fprintf(stderr, "total size of all interned strings: "
14565 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14566 "mortal/immortal\n", mortal_size, immortal_size);
14567 Py_DECREF(keys);
14568 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014569 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014570}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014571
14572
14573/********************* Unicode Iterator **************************/
14574
14575typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014576 PyObject_HEAD
14577 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014578 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014579} unicodeiterobject;
14580
14581static void
14582unicodeiter_dealloc(unicodeiterobject *it)
14583{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014584 _PyObject_GC_UNTRACK(it);
14585 Py_XDECREF(it->it_seq);
14586 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014587}
14588
14589static int
14590unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14591{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014592 Py_VISIT(it->it_seq);
14593 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014594}
14595
14596static PyObject *
14597unicodeiter_next(unicodeiterobject *it)
14598{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014599 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014600
Benjamin Peterson14339b62009-01-31 16:36:08 +000014601 assert(it != NULL);
14602 seq = it->it_seq;
14603 if (seq == NULL)
14604 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014605 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014607 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14608 int kind = PyUnicode_KIND(seq);
14609 void *data = PyUnicode_DATA(seq);
14610 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14611 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014612 if (item != NULL)
14613 ++it->it_index;
14614 return item;
14615 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014616
Benjamin Peterson14339b62009-01-31 16:36:08 +000014617 Py_DECREF(seq);
14618 it->it_seq = NULL;
14619 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014620}
14621
14622static PyObject *
14623unicodeiter_len(unicodeiterobject *it)
14624{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014625 Py_ssize_t len = 0;
14626 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014627 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014628 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014629}
14630
14631PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14632
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014633static PyObject *
14634unicodeiter_reduce(unicodeiterobject *it)
14635{
14636 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014637 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014638 it->it_seq, it->it_index);
14639 } else {
14640 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14641 if (u == NULL)
14642 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014643 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014644 }
14645}
14646
14647PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14648
14649static PyObject *
14650unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14651{
14652 Py_ssize_t index = PyLong_AsSsize_t(state);
14653 if (index == -1 && PyErr_Occurred())
14654 return NULL;
14655 if (index < 0)
14656 index = 0;
14657 it->it_index = index;
14658 Py_RETURN_NONE;
14659}
14660
14661PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14662
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014663static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014664 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014665 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014666 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14667 reduce_doc},
14668 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14669 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014670 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014671};
14672
14673PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014674 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14675 "str_iterator", /* tp_name */
14676 sizeof(unicodeiterobject), /* tp_basicsize */
14677 0, /* tp_itemsize */
14678 /* methods */
14679 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14680 0, /* tp_print */
14681 0, /* tp_getattr */
14682 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014683 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014684 0, /* tp_repr */
14685 0, /* tp_as_number */
14686 0, /* tp_as_sequence */
14687 0, /* tp_as_mapping */
14688 0, /* tp_hash */
14689 0, /* tp_call */
14690 0, /* tp_str */
14691 PyObject_GenericGetAttr, /* tp_getattro */
14692 0, /* tp_setattro */
14693 0, /* tp_as_buffer */
14694 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14695 0, /* tp_doc */
14696 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14697 0, /* tp_clear */
14698 0, /* tp_richcompare */
14699 0, /* tp_weaklistoffset */
14700 PyObject_SelfIter, /* tp_iter */
14701 (iternextfunc)unicodeiter_next, /* tp_iternext */
14702 unicodeiter_methods, /* tp_methods */
14703 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014704};
14705
14706static PyObject *
14707unicode_iter(PyObject *seq)
14708{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014709 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014710
Benjamin Peterson14339b62009-01-31 16:36:08 +000014711 if (!PyUnicode_Check(seq)) {
14712 PyErr_BadInternalCall();
14713 return NULL;
14714 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014715 if (PyUnicode_READY(seq) == -1)
14716 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014717 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14718 if (it == NULL)
14719 return NULL;
14720 it->it_index = 0;
14721 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014722 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014723 _PyObject_GC_TRACK(it);
14724 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014725}
14726
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014727
14728size_t
14729Py_UNICODE_strlen(const Py_UNICODE *u)
14730{
14731 int res = 0;
14732 while(*u++)
14733 res++;
14734 return res;
14735}
14736
14737Py_UNICODE*
14738Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14739{
14740 Py_UNICODE *u = s1;
14741 while ((*u++ = *s2++));
14742 return s1;
14743}
14744
14745Py_UNICODE*
14746Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14747{
14748 Py_UNICODE *u = s1;
14749 while ((*u++ = *s2++))
14750 if (n-- == 0)
14751 break;
14752 return s1;
14753}
14754
14755Py_UNICODE*
14756Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14757{
14758 Py_UNICODE *u1 = s1;
14759 u1 += Py_UNICODE_strlen(u1);
14760 Py_UNICODE_strcpy(u1, s2);
14761 return s1;
14762}
14763
14764int
14765Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14766{
14767 while (*s1 && *s2 && *s1 == *s2)
14768 s1++, s2++;
14769 if (*s1 && *s2)
14770 return (*s1 < *s2) ? -1 : +1;
14771 if (*s1)
14772 return 1;
14773 if (*s2)
14774 return -1;
14775 return 0;
14776}
14777
14778int
14779Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14780{
14781 register Py_UNICODE u1, u2;
14782 for (; n != 0; n--) {
14783 u1 = *s1;
14784 u2 = *s2;
14785 if (u1 != u2)
14786 return (u1 < u2) ? -1 : +1;
14787 if (u1 == '\0')
14788 return 0;
14789 s1++;
14790 s2++;
14791 }
14792 return 0;
14793}
14794
14795Py_UNICODE*
14796Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14797{
14798 const Py_UNICODE *p;
14799 for (p = s; *p; p++)
14800 if (*p == c)
14801 return (Py_UNICODE*)p;
14802 return NULL;
14803}
14804
14805Py_UNICODE*
14806Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14807{
14808 const Py_UNICODE *p;
14809 p = s + Py_UNICODE_strlen(s);
14810 while (p != s) {
14811 p--;
14812 if (*p == c)
14813 return (Py_UNICODE*)p;
14814 }
14815 return NULL;
14816}
Victor Stinner331ea922010-08-10 16:37:20 +000014817
Victor Stinner71133ff2010-09-01 23:43:53 +000014818Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014819PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014820{
Victor Stinner577db2c2011-10-11 22:12:48 +020014821 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014822 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014824 if (!PyUnicode_Check(unicode)) {
14825 PyErr_BadArgument();
14826 return NULL;
14827 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014828 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014829 if (u == NULL)
14830 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014831 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014832 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014833 PyErr_NoMemory();
14834 return NULL;
14835 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014836 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014837 size *= sizeof(Py_UNICODE);
14838 copy = PyMem_Malloc(size);
14839 if (copy == NULL) {
14840 PyErr_NoMemory();
14841 return NULL;
14842 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014843 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014844 return copy;
14845}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014846
Georg Brandl66c221e2010-10-14 07:04:07 +000014847/* A _string module, to export formatter_parser and formatter_field_name_split
14848 to the string.Formatter class implemented in Python. */
14849
14850static PyMethodDef _string_methods[] = {
14851 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14852 METH_O, PyDoc_STR("split the argument as a field name")},
14853 {"formatter_parser", (PyCFunction) formatter_parser,
14854 METH_O, PyDoc_STR("parse the argument as a format string")},
14855 {NULL, NULL}
14856};
14857
14858static struct PyModuleDef _string_module = {
14859 PyModuleDef_HEAD_INIT,
14860 "_string",
14861 PyDoc_STR("string helper module"),
14862 0,
14863 _string_methods,
14864 NULL,
14865 NULL,
14866 NULL,
14867 NULL
14868};
14869
14870PyMODINIT_FUNC
14871PyInit__string(void)
14872{
14873 return PyModule_Create(&_string_module);
14874}
14875
14876
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014877#ifdef __cplusplus
14878}
14879#endif