blob: e38ded0fbce8c47620ab6c6fcc83a315a3bec488 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros below: the consistency checker
   (without content inspection) in debug builds, a plain type check
   otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* UTF-8 pointer: raw field access, then the checked variant.  For a compact
   ASCII string the checked variant yields the memory that directly follows
   the PyASCIIObject header instead of the utf8 field. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* UTF-8 length: raw field access, then the checked variant.  For a compact
   ASCII string the checked variant yields the string length field. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors. */
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* Field accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* File-local replacement for PyUnicode_READY: evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the UTF-8 pointer of the string is the same pointer as its
   character data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the string is the same pointer as its
   character data.  The macro argument is used for both operands, so the
   expansion does not depend on a variable named "unicode" existing at the
   call site. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object has an allocated UTF-8 memory block of its own:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) && \
     _PyUnicode_UTF8(op) != NULL && \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has an allocated wstr memory block of its own:
   its wstr pointer is set and it is not the case that the string is ready
   with wstr pointing at the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) != NULL && \
     !(PyUnicode_IS_READY(op) && \
       _PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
140
/* Generic helper macro that copies a run of characters while converting
   each one from "from_type" to "to_type" with a plain cast.

   from_type, to_type: valid type names.
   begin, end:         pointers of type "from_type *" delimiting the source.
   to:                 pointer of type "to_type *" to the output buffer.

   The bulk of the input is handled four characters per iteration; the
   remaining zero to three characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        to_type *_dst = (to_type *)(to); \
        const from_type *_src = (from_type *)(begin); \
        const from_type *_src_end = (from_type *)(end); \
        Py_ssize_t _count = _src_end - _src; \
        const from_type *_bulk_end = \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4); \
        for (; _src < _bulk_end; _src += 4, _dst += 4) { \
            _dst[0] = (to_type) _src[0]; \
            _dst[1] = (to_type) _src[1]; \
            _dst[2] = (to_type) _src[2]; \
            _dst[3] = (to_type) _src[3]; \
        } \
        for (; _src < _src_end; ++_src, ++_dst) \
            *_dst = (to_type) *_src; \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.
   One entry per code point 0x00..0x7F; an entry is 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Same for linebreaks: one entry per code point 0x00..0x7F, 1 for a
   linebreak character.  The table is only ever read, so it is declared
   const (like _Py_ascii_whitespace) and cannot be modified by accident. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build validator for the internal state of a Unicode object.

   op:            the object to check; must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not a wchar_t-only string),
                  also scan the characters to verify that the narrowest
                  possible kind is used and that the buffer is terminated
                  by a zero character.

   Every violated invariant trips an assert(); the function itself always
   returns 1 so that it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cmp = (PyCompactUnicodeObject *)op;
        void *chars;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (hdr->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            chars = cmp + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(cmp->utf8 != chars);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* not compact, wchar_t representation only (not ready) */
            chars = ((PyUnicodeObject *)op)->data.any;
            assert(hdr->length == 0);
            assert(hdr->hash == -1);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 0);
            assert(hdr->state.interned == SSTATE_NOT_INTERNED);
            assert(hdr->wstr != NULL);
            assert(chars == NULL);
            assert(cmp->utf8 == NULL);
        }
        else {
            /* not compact, ready */
            chars = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ready == 1);
            assert(chars != NULL);
            if (hdr->state.ascii) {
                assert(cmp->utf8 == chars);
                assert(cmp->utf8_length == hdr->length);
            }
            else {
                assert(cmp->utf8 != chars);
            }
        }

        /* wstr shares the character buffer exactly when the kind has the
           width of wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                assert(hdr->wstr == chars);
                assert(cmp->wstr_length == hdr->length);
            }
            else {
                assert(hdr->wstr != chars);
            }
        }

        /* a missing buffer implies a zero cached length */
        assert(cmp->utf8 != NULL || cmp->utf8_length == 0);
        assert(hdr->wstr != NULL || cmp->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(hdr);

        for (pos = 0; pos < hdr->length; pos++) {
            Py_UCS4 code = PyUnicode_READ(kind, chars, pos);
            if (code > highest)
                highest = code;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (hdr->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }

        /* the character one past the end must be zero */
        assert(PyUnicode_READ(kind, chars, hdr->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
#ifdef HAVE_MBCS
/* Windows version information; only compiled when HAVE_MBCS is defined.
   NOTE(review): presumably filled in elsewhere in this file -- confirm. */
static OSVERSIONINFOEX winver;
#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
528/* stuff to implement simple "bloom filters" for Unicode characters.
529 to keep things simple, we use a single bitmask, using the least 5
530 bits from each unicode characters as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask used by BLOOM_LINEBREAK; starts with every bit set, so that
   every character passes the filter. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of "ch" is set in "mask".
   Both arguments are parenthesized so that an expression such as
   BLOOM(a | b, ch) tests the combined mask rather than parsing as
   a | (b & bit). */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Linebreak test: table lookup for characters below 128, otherwise the
   bloom filter followed by Py_UNICODE_ISLINEBREAK() for the hits. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
Victor Stinnerafffce42012-10-03 23:03:17 +0200679#ifdef Py_DEBUG
680/* Fill the data of an Unicode string with invalid characters to detect bugs
681 earlier.
682
683 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
684 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
685 invalid character in Unicode 6.0. */
686static void
687unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
688{
689 int kind = PyUnicode_KIND(unicode);
690 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
691 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
692 if (length <= old_length)
693 return;
694 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
695}
696#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000870 Ux0000 terminated; some code (e.g. new_identifier)
871 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872
873 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000874 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875
876*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001014 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1015
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 if (ascii->wstr == data)
1017 printf("shared ");
1018 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001019
Victor Stinnera3b334d2011-10-03 13:53:37 +02001020 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(" (%zu), ", compact->wstr_length);
1022 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1023 printf("shared ");
1024 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
1144#if SIZEOF_WCHAR_T == 2
1145/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1146 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001147 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
1149 This function assumes that unicode can hold one more code point than wstr
1150 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001151static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001153 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154{
1155 const wchar_t *iter;
1156 Py_UCS4 *ucs4_out;
1157
Victor Stinner910337b2011-10-03 03:20:16 +02001158 assert(unicode != NULL);
1159 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1161 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1162
1163 for (iter = begin; iter < end; ) {
1164 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1165 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001166 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1167 && (iter+1) < end
1168 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 {
Victor Stinner551ac952011-11-29 22:58:13 +01001170 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 iter += 2;
1172 }
1173 else {
1174 *ucs4_out++ = *iter;
1175 iter++;
1176 }
1177 }
1178 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1179 _PyUnicode_GET_LENGTH(unicode)));
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181}
1182#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
1373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
1375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001442int
1443_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444{
1445 wchar_t *end;
1446 Py_UCS4 maxchar = 0;
1447 Py_ssize_t num_surrogates;
1448#if SIZEOF_WCHAR_T == 2
1449 Py_ssize_t length_wo_surrogates;
1450#endif
1451
Georg Brandl7597add2011-10-05 16:36:47 +02001452 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 strings were created using _PyObject_New() and where no canonical
1454 representation (the str field) has been set yet aka strings
1455 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001456 assert(_PyUnicode_CHECK(unicode));
1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 /* Actually, it should neither be interned nor be anything else: */
1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001466 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468
1469 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1471 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 PyErr_NoMemory();
1473 return -1;
1474 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001475 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_WSTR(unicode), end,
1477 PyUnicode_1BYTE_DATA(unicode));
1478 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1479 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1480 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1481 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001487 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 }
1491 PyObject_FREE(_PyUnicode_WSTR(unicode));
1492 _PyUnicode_WSTR(unicode) = NULL;
1493 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1494 }
1495 /* In this case we might have to convert down from 4-byte native
1496 wchar_t to 2-byte unicode. */
1497 else if (maxchar < 65536) {
1498 assert(num_surrogates == 0 &&
1499 "FindMaxCharAndNumSurrogatePairs() messed up");
1500
Victor Stinner506f5922011-09-28 22:34:18 +02001501#if SIZEOF_WCHAR_T == 2
1502 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001504 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1505 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1506 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001507 _PyUnicode_UTF8(unicode) = NULL;
1508 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001509#else
1510 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001512 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001514 PyErr_NoMemory();
1515 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 }
Victor Stinner506f5922011-09-28 22:34:18 +02001517 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1518 _PyUnicode_WSTR(unicode), end,
1519 PyUnicode_2BYTE_DATA(unicode));
1520 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1521 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1522 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001523 _PyUnicode_UTF8(unicode) = NULL;
1524 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyObject_FREE(_PyUnicode_WSTR(unicode));
1526 _PyUnicode_WSTR(unicode) = NULL;
1527 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1528#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 }
1530 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1531 else {
1532#if SIZEOF_WCHAR_T == 2
1533 /* in case the native representation is 2-bytes, we need to allocate a
1534 new normalized 4-byte version. */
1535 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001536 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1537 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 PyErr_NoMemory();
1539 return -1;
1540 }
1541 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1542 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001543 _PyUnicode_UTF8(unicode) = NULL;
1544 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001545 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1546 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001547 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 PyObject_FREE(_PyUnicode_WSTR(unicode));
1549 _PyUnicode_WSTR(unicode) = NULL;
1550 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1551#else
1552 assert(num_surrogates == 0);
1553
Victor Stinnerc3c74152011-10-02 20:39:55 +02001554 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001556 _PyUnicode_UTF8(unicode) = NULL;
1557 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1559#endif
1560 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1561 }
1562 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001563 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 return 0;
1565}
1566
Alexander Belopolsky40018472011-02-26 01:02:56 +00001567static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001568unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569{
Walter Dörwald16807132007-05-25 13:52:07 +00001570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 case SSTATE_NOT_INTERNED:
1572 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 case SSTATE_INTERNED_MORTAL:
1575 /* revive dead object temporarily for DelItem */
1576 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001577 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 Py_FatalError(
1579 "deletion of interned string failed");
1580 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_INTERNED_IMMORTAL:
1583 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 default:
1586 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001587 }
1588
Victor Stinner03490912011-10-03 23:45:12 +02001589 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001591 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001592 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001593 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1594 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001596 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597}
1598
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singletons,
   i.e. the empty string or a one-character string stored in the
   unicode_latin1 cache; return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that already has a canonical kind can be
       a cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return first < 256 && unicode_latin1[first] == unicode;
}
#endif
1615
Alexander Belopolsky40018472011-02-26 01:02:56 +00001616static int
Victor Stinner488fa492011-12-12 00:01:39 +01001617unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618{
Victor Stinner488fa492011-12-12 00:01:39 +01001619 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (Py_REFCNT(unicode) != 1)
1621 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001622 if (_PyUnicode_HASH(unicode) != -1)
1623 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (PyUnicode_CHECK_INTERNED(unicode))
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (!PyUnicode_CheckExact(unicode))
1627 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001628#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001629 /* singleton refcount is greater than 1 */
1630 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001631#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001632 return 1;
1633}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635static int
1636unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1637{
1638 PyObject *unicode;
1639 Py_ssize_t old_length;
1640
1641 assert(p_unicode != NULL);
1642 unicode = *p_unicode;
1643
1644 assert(unicode != NULL);
1645 assert(PyUnicode_Check(unicode));
1646 assert(0 <= length);
1647
Victor Stinner910337b2011-10-03 03:20:16 +02001648 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001649 old_length = PyUnicode_WSTR_LENGTH(unicode);
1650 else
1651 old_length = PyUnicode_GET_LENGTH(unicode);
1652 if (old_length == length)
1653 return 0;
1654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001656 _Py_INCREF_UNICODE_EMPTY();
1657 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001658 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 Py_DECREF(*p_unicode);
1660 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 return 0;
1662 }
1663
Victor Stinner488fa492011-12-12 00:01:39 +01001664 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 PyObject *copy = resize_copy(unicode, length);
1666 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001667 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 Py_DECREF(*p_unicode);
1669 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
1672
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001674 PyObject *new_unicode = resize_compact(unicode, length);
1675 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001677 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001679 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001680 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001681}
1682
Alexander Belopolsky40018472011-02-26 01:02:56 +00001683int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001684PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001685{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686 PyObject *unicode;
1687 if (p_unicode == NULL) {
1688 PyErr_BadInternalCall();
1689 return -1;
1690 }
1691 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693 {
1694 PyErr_BadInternalCall();
1695 return -1;
1696 }
1697 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001698}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001699
Victor Stinnerc5166102012-02-22 13:55:02 +01001700/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001701
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001702 WARNING: The function doesn't copy the terminating null character and
1703 doesn't check the maximum character (may write a latin1 character in an
1704 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001705static void
1706unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1707 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001708{
1709 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1710 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001711 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001712
1713 switch (kind) {
1714 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001715 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001716#ifdef Py_DEBUG
1717 if (PyUnicode_IS_ASCII(unicode)) {
1718 Py_UCS4 maxchar = ucs1lib_find_max_char(
1719 (const Py_UCS1*)str,
1720 (const Py_UCS1*)str + len);
1721 assert(maxchar < 128);
1722 }
1723#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001724 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001725 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001726 }
1727 case PyUnicode_2BYTE_KIND: {
1728 Py_UCS2 *start = (Py_UCS2 *)data + index;
1729 Py_UCS2 *ucs2 = start;
1730 assert(index <= PyUnicode_GET_LENGTH(unicode));
1731
Victor Stinner184252a2012-06-16 02:57:41 +02001732 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 *ucs2 = (Py_UCS2)*str;
1734
1735 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001736 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 }
1738 default: {
1739 Py_UCS4 *start = (Py_UCS4 *)data + index;
1740 Py_UCS4 *ucs4 = start;
1741 assert(kind == PyUnicode_4BYTE_KIND);
1742 assert(index <= PyUnicode_GET_LENGTH(unicode));
1743
Victor Stinner184252a2012-06-16 02:57:41 +02001744 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001745 *ucs4 = (Py_UCS4)*str;
1746
1747 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001748 }
1749 }
1750}
1751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752static PyObject*
1753get_latin1_char(unsigned char ch)
1754{
Victor Stinnera464fc12011-10-02 20:39:30 +02001755 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode)
1759 return NULL;
1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 unicode_latin1[ch] = unicode;
1763 }
1764 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001765 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Victor Stinner985a82a2014-01-03 12:53:47 +01001768static PyObject*
1769unicode_char(Py_UCS4 ch)
1770{
1771 PyObject *unicode;
1772
1773 assert(ch <= MAX_UNICODE);
1774
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001775 if (ch < 256)
1776 return get_latin1_char(ch);
1777
Victor Stinner985a82a2014-01-03 12:53:47 +01001778 unicode = PyUnicode_New(1, ch);
1779 if (unicode == NULL)
1780 return NULL;
1781 switch (PyUnicode_KIND(unicode)) {
1782 case PyUnicode_1BYTE_KIND:
1783 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1784 break;
1785 case PyUnicode_2BYTE_KIND:
1786 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1787 break;
1788 default:
1789 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1790 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1791 }
1792 assert(_PyUnicode_CheckConsistency(unicode, 1));
1793 return unicode;
1794}
1795
Alexander Belopolsky40018472011-02-26 01:02:56 +00001796PyObject *
1797PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001799 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 Py_UCS4 maxchar = 0;
1801 Py_ssize_t num_surrogates;
1802
1803 if (u == NULL)
1804 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001806 /* If the Unicode data is known at construction time, we can apply
1807 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001810 if (size == 0)
1811 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Single character Unicode objects in the Latin-1 range are
1814 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001815 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 return get_latin1_char((unsigned char)*u);
1817
1818 /* If not empty and not single character, copy the Unicode data
1819 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 if (find_maxchar_surrogates(u, u + size,
1821 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 return NULL;
1823
Victor Stinner8faf8212011-12-08 22:14:11 +01001824 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 if (!unicode)
1826 return NULL;
1827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 switch (PyUnicode_KIND(unicode)) {
1829 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001830 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1832 break;
1833 case PyUnicode_2BYTE_KIND:
1834#if Py_UNICODE_SIZE == 2
1835 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1836#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001837 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1839#endif
1840 break;
1841 case PyUnicode_4BYTE_KIND:
1842#if SIZEOF_WCHAR_T == 2
1843 /* This is the only case which has to process surrogates, thus
1844 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001845 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846#else
1847 assert(num_surrogates == 0);
1848 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1849#endif
1850 break;
1851 default:
1852 assert(0 && "Impossible state");
1853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001855 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856}
1857
Alexander Belopolsky40018472011-02-26 01:02:56 +00001858PyObject *
1859PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001860{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001861 if (size < 0) {
1862 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001864 return NULL;
1865 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001866 if (u != NULL)
1867 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1868 else
1869 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001870}
1871
Alexander Belopolsky40018472011-02-26 01:02:56 +00001872PyObject *
1873PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001874{
1875 size_t size = strlen(u);
1876 if (size > PY_SSIZE_T_MAX) {
1877 PyErr_SetString(PyExc_OverflowError, "input too long");
1878 return NULL;
1879 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001880 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001881}
1882
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001883PyObject *
1884_PyUnicode_FromId(_Py_Identifier *id)
1885{
1886 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001887 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1888 strlen(id->string),
1889 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001890 if (!id->object)
1891 return NULL;
1892 PyUnicode_InternInPlace(&id->object);
1893 assert(!id->next);
1894 id->next = static_strings;
1895 static_strings = id;
1896 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001897 return id->object;
1898}
1899
1900void
1901_PyUnicode_ClearStaticStrings()
1902{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001903 _Py_Identifier *tmp, *s = static_strings;
1904 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001905 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001906 tmp = s->next;
1907 s->next = NULL;
1908 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001909 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001911}
1912
Benjamin Peterson0df54292012-03-26 14:50:32 -04001913/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914
Victor Stinnerd3f08822012-05-29 12:57:52 +02001915PyObject*
1916_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001917{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001918 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001919 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001920 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001921#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001922 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001923#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001924 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001925 }
Victor Stinner785938e2011-12-11 20:09:03 +01001926 unicode = PyUnicode_New(size, 127);
1927 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001928 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001929 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1930 assert(_PyUnicode_CheckConsistency(unicode, 1));
1931 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001932}
1933
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001934static Py_UCS4
1935kind_maxchar_limit(unsigned int kind)
1936{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001937 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001938 case PyUnicode_1BYTE_KIND:
1939 return 0x80;
1940 case PyUnicode_2BYTE_KIND:
1941 return 0x100;
1942 case PyUnicode_4BYTE_KIND:
1943 return 0x10000;
1944 default:
1945 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001946 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001947 }
1948}
1949
Victor Stinnere6abb482012-05-02 01:15:40 +02001950Py_LOCAL_INLINE(Py_UCS4)
1951align_maxchar(Py_UCS4 maxchar)
1952{
1953 if (maxchar <= 127)
1954 return 127;
1955 else if (maxchar <= 255)
1956 return 255;
1957 else if (maxchar <= 65535)
1958 return 65535;
1959 else
1960 return MAX_UNICODE;
1961}
1962
Victor Stinner702c7342011-10-05 13:50:52 +02001963static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001964_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001967 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001968
Serhiy Storchaka678db842013-01-26 12:16:36 +02001969 if (size == 0)
1970 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001972 if (size == 1)
1973 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001975 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001976 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 if (!res)
1978 return NULL;
1979 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001982}
1983
Victor Stinnere57b1c02011-09-28 22:20:48 +02001984static PyObject*
1985_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986{
1987 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001988 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989
Serhiy Storchaka678db842013-01-26 12:16:36 +02001990 if (size == 0)
1991 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001992 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001993 if (size == 1)
1994 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001995
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001996 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001997 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 if (!res)
1999 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002000 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002002 else {
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2005 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002006 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return res;
2008}
2009
Victor Stinnere57b1c02011-09-28 22:20:48 +02002010static PyObject*
2011_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012{
2013 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002014 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002015
Serhiy Storchaka678db842013-01-26 12:16:36 +02002016 if (size == 0)
2017 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002019 if (size == 1)
2020 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002021
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002022 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002023 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 if (!res)
2025 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002026 if (max_char < 256)
2027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2028 PyUnicode_1BYTE_DATA(res));
2029 else if (max_char < 0x10000)
2030 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2031 PyUnicode_2BYTE_DATA(res));
2032 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002034 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return res;
2036}
2037
2038PyObject*
2039PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2040{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002041 if (size < 0) {
2042 PyErr_SetString(PyExc_ValueError, "size must be positive");
2043 return NULL;
2044 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002045 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002047 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002052 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002053 PyErr_SetString(PyExc_SystemError, "invalid kind");
2054 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056}
2057
Victor Stinnerece58de2012-04-23 23:36:38 +02002058Py_UCS4
2059_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2060{
2061 enum PyUnicode_Kind kind;
2062 void *startptr, *endptr;
2063
2064 assert(PyUnicode_IS_READY(unicode));
2065 assert(0 <= start);
2066 assert(end <= PyUnicode_GET_LENGTH(unicode));
2067 assert(start <= end);
2068
2069 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2070 return PyUnicode_MAX_CHAR_VALUE(unicode);
2071
2072 if (start == end)
2073 return 127;
2074
Victor Stinner94d558b2012-04-27 22:26:58 +02002075 if (PyUnicode_IS_ASCII(unicode))
2076 return 127;
2077
Victor Stinnerece58de2012-04-23 23:36:38 +02002078 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002079 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002080 endptr = (char *)startptr + end * kind;
2081 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002082 switch(kind) {
2083 case PyUnicode_1BYTE_KIND:
2084 return ucs1lib_find_max_char(startptr, endptr);
2085 case PyUnicode_2BYTE_KIND:
2086 return ucs2lib_find_max_char(startptr, endptr);
2087 case PyUnicode_4BYTE_KIND:
2088 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002090 assert(0);
2091 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002092 }
2093}
2094
Victor Stinner25a4b292011-10-06 12:31:55 +02002095/* Ensure that a string uses the most efficient storage, if it is not the
2096 case: create a new string with of the right kind. Write NULL into *p_unicode
2097 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002098static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002099unicode_adjust_maxchar(PyObject **p_unicode)
2100{
2101 PyObject *unicode, *copy;
2102 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002103 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002104 unsigned int kind;
2105
2106 assert(p_unicode != NULL);
2107 unicode = *p_unicode;
2108 assert(PyUnicode_IS_READY(unicode));
2109 if (PyUnicode_IS_ASCII(unicode))
2110 return;
2111
2112 len = PyUnicode_GET_LENGTH(unicode);
2113 kind = PyUnicode_KIND(unicode);
2114 if (kind == PyUnicode_1BYTE_KIND) {
2115 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002116 max_char = ucs1lib_find_max_char(u, u + len);
2117 if (max_char >= 128)
2118 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 }
2120 else if (kind == PyUnicode_2BYTE_KIND) {
2121 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002122 max_char = ucs2lib_find_max_char(u, u + len);
2123 if (max_char >= 256)
2124 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002125 }
2126 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002127 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002128 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 max_char = ucs4lib_find_max_char(u, u + len);
2130 if (max_char >= 0x10000)
2131 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002134 if (copy != NULL)
2135 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 Py_DECREF(unicode);
2137 *p_unicode = copy;
2138}
2139
Victor Stinner034f6cf2011-09-30 02:26:44 +02002140PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002141_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142{
Victor Stinner87af4f22011-11-21 23:03:47 +01002143 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002144 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145
Victor Stinner034f6cf2011-09-30 02:26:44 +02002146 if (!PyUnicode_Check(unicode)) {
2147 PyErr_BadInternalCall();
2148 return NULL;
2149 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002150 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002151 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152
Victor Stinner87af4f22011-11-21 23:03:47 +01002153 length = PyUnicode_GET_LENGTH(unicode);
2154 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002155 if (!copy)
2156 return NULL;
2157 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2158
Victor Stinner87af4f22011-11-21 23:03:47 +01002159 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2160 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002161 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002162 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002163}
2164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165
Victor Stinnerbc603d12011-10-02 01:00:40 +02002166/* Widen Unicode objects to larger buffers. Don't write terminating null
2167 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168
2169void*
2170_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2171{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 Py_ssize_t len;
2173 void *result;
2174 unsigned int skind;
2175
Benjamin Petersonbac79492012-01-14 13:34:47 -05002176 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002177 return NULL;
2178
2179 len = PyUnicode_GET_LENGTH(s);
2180 skind = PyUnicode_KIND(s);
2181 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002182 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 return NULL;
2184 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002185 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 case PyUnicode_2BYTE_KIND:
2187 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2188 if (!result)
2189 return PyErr_NoMemory();
2190 assert(skind == PyUnicode_1BYTE_KIND);
2191 _PyUnicode_CONVERT_BYTES(
2192 Py_UCS1, Py_UCS2,
2193 PyUnicode_1BYTE_DATA(s),
2194 PyUnicode_1BYTE_DATA(s) + len,
2195 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002197 case PyUnicode_4BYTE_KIND:
2198 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2199 if (!result)
2200 return PyErr_NoMemory();
2201 if (skind == PyUnicode_2BYTE_KIND) {
2202 _PyUnicode_CONVERT_BYTES(
2203 Py_UCS2, Py_UCS4,
2204 PyUnicode_2BYTE_DATA(s),
2205 PyUnicode_2BYTE_DATA(s) + len,
2206 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002208 else {
2209 assert(skind == PyUnicode_1BYTE_KIND);
2210 _PyUnicode_CONVERT_BYTES(
2211 Py_UCS1, Py_UCS4,
2212 PyUnicode_1BYTE_DATA(s),
2213 PyUnicode_1BYTE_DATA(s) + len,
2214 result);
2215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002217 default:
2218 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 }
Victor Stinner01698042011-10-04 00:04:26 +02002220 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 return NULL;
2222}
2223
2224static Py_UCS4*
2225as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2226 int copy_null)
2227{
2228 int kind;
2229 void *data;
2230 Py_ssize_t len, targetlen;
2231 if (PyUnicode_READY(string) == -1)
2232 return NULL;
2233 kind = PyUnicode_KIND(string);
2234 data = PyUnicode_DATA(string);
2235 len = PyUnicode_GET_LENGTH(string);
2236 targetlen = len;
2237 if (copy_null)
2238 targetlen++;
2239 if (!target) {
2240 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2241 PyErr_NoMemory();
2242 return NULL;
2243 }
2244 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2245 if (!target) {
2246 PyErr_NoMemory();
2247 return NULL;
2248 }
2249 }
2250 else {
2251 if (targetsize < targetlen) {
2252 PyErr_Format(PyExc_SystemError,
2253 "string is longer than the buffer");
2254 if (copy_null && 0 < targetsize)
2255 target[0] = 0;
2256 return NULL;
2257 }
2258 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002259 if (kind == PyUnicode_1BYTE_KIND) {
2260 Py_UCS1 *start = (Py_UCS1 *) data;
2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002263 else if (kind == PyUnicode_2BYTE_KIND) {
2264 Py_UCS2 *start = (Py_UCS2 *) data;
2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266 }
2267 else {
2268 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 if (copy_null)
2272 target[len] = 0;
2273 return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278 int copy_null)
2279{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002280 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 PyErr_BadInternalCall();
2282 return NULL;
2283 }
2284 return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
2287Py_UCS4*
2288PyUnicode_AsUCS4Copy(PyObject *string)
2289{
2290 return as_ucs4(string, NULL, 0, 1);
2291}
2292
2293#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002294
Alexander Belopolsky40018472011-02-26 01:02:56 +00002295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002300 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 PyErr_BadInternalCall();
2302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 }
2304
Martin v. Löwis790465f2008-04-05 20:41:37 +00002305 if (size == -1) {
2306 size = wcslen(w);
2307 }
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002313
Walter Dörwald346737f2007-05-31 10:44:43 +00002314static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002315makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002316 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002317{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002318 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 if (longflag)
2320 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002321 else if (longlongflag) {
2322 /* longlongflag should only ever be nonzero on machines with
2323 HAVE_LONG_LONG defined */
2324#ifdef HAVE_LONG_LONG
2325 char *f = PY_FORMAT_LONG_LONG;
2326 while (*f)
2327 *fmt++ = *f++;
2328#else
2329 /* we shouldn't ever get here */
2330 assert(0);
2331 *fmt++ = 'l';
2332#endif
2333 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002334 else if (size_tflag) {
2335 char *f = PY_FORMAT_SIZE_T;
2336 while (*f)
2337 *fmt++ = *f++;
2338 }
2339 *fmt++ = c;
2340 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002341}
2342
Victor Stinner15a11362012-10-06 23:48:20 +02002343/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002344 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2345 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2346#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002347
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002348static int
2349unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2350 Py_ssize_t width, Py_ssize_t precision)
2351{
2352 Py_ssize_t length, fill, arglen;
2353 Py_UCS4 maxchar;
2354
2355 if (PyUnicode_READY(str) == -1)
2356 return -1;
2357
2358 length = PyUnicode_GET_LENGTH(str);
2359 if ((precision == -1 || precision >= length)
2360 && width <= length)
2361 return _PyUnicodeWriter_WriteStr(writer, str);
2362
2363 if (precision != -1)
2364 length = Py_MIN(precision, length);
2365
2366 arglen = Py_MAX(length, width);
2367 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2368 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2369 else
2370 maxchar = writer->maxchar;
2371
2372 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2373 return -1;
2374
2375 if (width > length) {
2376 fill = width - length;
2377 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2378 return -1;
2379 writer->pos += fill;
2380 }
2381
2382 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2383 str, 0, length);
2384 writer->pos += length;
2385 return 0;
2386}
2387
2388static int
2389unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2390 Py_ssize_t width, Py_ssize_t precision)
2391{
2392 /* UTF-8 */
2393 Py_ssize_t length;
2394 PyObject *unicode;
2395 int res;
2396
2397 length = strlen(str);
2398 if (precision != -1)
2399 length = Py_MIN(length, precision);
2400 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2401 if (unicode == NULL)
2402 return -1;
2403
2404 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2405 Py_DECREF(unicode);
2406 return res;
2407}
2408
Victor Stinner96865452011-03-01 23:44:09 +00002409static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002410unicode_fromformat_arg(_PyUnicodeWriter *writer,
2411 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002412{
Victor Stinnere215d962012-10-06 23:03:36 +02002413 const char *p;
2414 Py_ssize_t len;
2415 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002416 Py_ssize_t width;
2417 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002418 int longflag;
2419 int longlongflag;
2420 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002421 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002422
2423 p = f;
2424 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002425 zeropad = 0;
2426 if (*f == '0') {
2427 zeropad = 1;
2428 f++;
2429 }
Victor Stinner96865452011-03-01 23:44:09 +00002430
2431 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002432 width = -1;
2433 if (Py_ISDIGIT((unsigned)*f)) {
2434 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002435 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002436 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002437 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002438 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002440 return NULL;
2441 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002442 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002443 f++;
2444 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002445 }
2446 precision = -1;
2447 if (*f == '.') {
2448 f++;
2449 if (Py_ISDIGIT((unsigned)*f)) {
2450 precision = (*f - '0');
2451 f++;
2452 while (Py_ISDIGIT((unsigned)*f)) {
2453 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2454 PyErr_SetString(PyExc_ValueError,
2455 "precision too big");
2456 return NULL;
2457 }
2458 precision = (precision * 10) + (*f - '0');
2459 f++;
2460 }
2461 }
Victor Stinner96865452011-03-01 23:44:09 +00002462 if (*f == '%') {
2463 /* "%.3%s" => f points to "3" */
2464 f--;
2465 }
2466 }
2467 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002468 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002469 f--;
2470 }
Victor Stinner96865452011-03-01 23:44:09 +00002471
2472 /* Handle %ld, %lu, %lld and %llu. */
2473 longflag = 0;
2474 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002475 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002476 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002477 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002478 longflag = 1;
2479 ++f;
2480 }
2481#ifdef HAVE_LONG_LONG
2482 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002483 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002484 longlongflag = 1;
2485 f += 2;
2486 }
2487#endif
2488 }
2489 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002490 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002491 size_tflag = 1;
2492 ++f;
2493 }
Victor Stinnere215d962012-10-06 23:03:36 +02002494
2495 if (f[1] == '\0')
2496 writer->overallocate = 0;
2497
2498 switch (*f) {
2499 case 'c':
2500 {
2501 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002502 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002503 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002504 "character argument not in range(0x110000)");
2505 return NULL;
2506 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002507 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002508 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002509 break;
2510 }
2511
2512 case 'i':
2513 case 'd':
2514 case 'u':
2515 case 'x':
2516 {
2517 /* used by sprintf */
2518 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002519 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002520 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002521
2522 if (*f == 'u') {
2523 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2524
2525 if (longflag)
2526 len = sprintf(buffer, fmt,
2527 va_arg(*vargs, unsigned long));
2528#ifdef HAVE_LONG_LONG
2529 else if (longlongflag)
2530 len = sprintf(buffer, fmt,
2531 va_arg(*vargs, unsigned PY_LONG_LONG));
2532#endif
2533 else if (size_tflag)
2534 len = sprintf(buffer, fmt,
2535 va_arg(*vargs, size_t));
2536 else
2537 len = sprintf(buffer, fmt,
2538 va_arg(*vargs, unsigned int));
2539 }
2540 else if (*f == 'x') {
2541 makefmt(fmt, 0, 0, 0, 'x');
2542 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2543 }
2544 else {
2545 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2546
2547 if (longflag)
2548 len = sprintf(buffer, fmt,
2549 va_arg(*vargs, long));
2550#ifdef HAVE_LONG_LONG
2551 else if (longlongflag)
2552 len = sprintf(buffer, fmt,
2553 va_arg(*vargs, PY_LONG_LONG));
2554#endif
2555 else if (size_tflag)
2556 len = sprintf(buffer, fmt,
2557 va_arg(*vargs, Py_ssize_t));
2558 else
2559 len = sprintf(buffer, fmt,
2560 va_arg(*vargs, int));
2561 }
2562 assert(len >= 0);
2563
Victor Stinnere215d962012-10-06 23:03:36 +02002564 if (precision < len)
2565 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002566
2567 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002568 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2569 return NULL;
2570
Victor Stinnere215d962012-10-06 23:03:36 +02002571 if (width > precision) {
2572 Py_UCS4 fillchar;
2573 fill = width - precision;
2574 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002575 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2576 return NULL;
2577 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002578 }
Victor Stinner15a11362012-10-06 23:48:20 +02002579 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002580 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002581 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2582 return NULL;
2583 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002584 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585
Victor Stinner4a587072013-11-19 12:54:53 +01002586 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2587 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002588 break;
2589 }
2590
2591 case 'p':
2592 {
2593 char number[MAX_LONG_LONG_CHARS];
2594
2595 len = sprintf(number, "%p", va_arg(*vargs, void*));
2596 assert(len >= 0);
2597
2598 /* %p is ill-defined: ensure leading 0x. */
2599 if (number[1] == 'X')
2600 number[1] = 'x';
2601 else if (number[1] != 'x') {
2602 memmove(number + 2, number,
2603 strlen(number) + 1);
2604 number[0] = '0';
2605 number[1] = 'x';
2606 len += 2;
2607 }
2608
Victor Stinner4a587072013-11-19 12:54:53 +01002609 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
2611 break;
2612 }
2613
2614 case 's':
2615 {
2616 /* UTF-8 */
2617 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002618 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002619 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002620 break;
2621 }
2622
2623 case 'U':
2624 {
2625 PyObject *obj = va_arg(*vargs, PyObject *);
2626 assert(obj && _PyUnicode_CHECK(obj));
2627
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002629 return NULL;
2630 break;
2631 }
2632
2633 case 'V':
2634 {
2635 PyObject *obj = va_arg(*vargs, PyObject *);
2636 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002637 if (obj) {
2638 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002639 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002640 return NULL;
2641 }
2642 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002643 assert(str != NULL);
2644 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002645 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002646 }
2647 break;
2648 }
2649
2650 case 'S':
2651 {
2652 PyObject *obj = va_arg(*vargs, PyObject *);
2653 PyObject *str;
2654 assert(obj);
2655 str = PyObject_Str(obj);
2656 if (!str)
2657 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002658 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002659 Py_DECREF(str);
2660 return NULL;
2661 }
2662 Py_DECREF(str);
2663 break;
2664 }
2665
2666 case 'R':
2667 {
2668 PyObject *obj = va_arg(*vargs, PyObject *);
2669 PyObject *repr;
2670 assert(obj);
2671 repr = PyObject_Repr(obj);
2672 if (!repr)
2673 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002674 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002675 Py_DECREF(repr);
2676 return NULL;
2677 }
2678 Py_DECREF(repr);
2679 break;
2680 }
2681
2682 case 'A':
2683 {
2684 PyObject *obj = va_arg(*vargs, PyObject *);
2685 PyObject *ascii;
2686 assert(obj);
2687 ascii = PyObject_ASCII(obj);
2688 if (!ascii)
2689 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002690 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002691 Py_DECREF(ascii);
2692 return NULL;
2693 }
2694 Py_DECREF(ascii);
2695 break;
2696 }
2697
2698 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002699 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002700 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002701 break;
2702
2703 default:
2704 /* if we stumble upon an unknown formatting code, copy the rest
2705 of the format string to the output string. (we cannot just
2706 skip the code, since there's no way to know what's in the
2707 argument list) */
2708 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002709 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002710 return NULL;
2711 f = p+len;
2712 return f;
2713 }
2714
2715 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002716 return f;
2717}
2718
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719PyObject *
2720PyUnicode_FromFormatV(const char *format, va_list vargs)
2721{
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_list vargs2;
2723 const char *f;
2724 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002725
Victor Stinner8f674cc2013-04-17 23:02:17 +02002726 _PyUnicodeWriter_Init(&writer);
2727 writer.min_length = strlen(format) + 100;
2728 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002729
2730 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2731 Copy it to be able to pass a reference to a subfunction. */
2732 Py_VA_COPY(vargs2, vargs);
2733
2734 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002736 f = unicode_fromformat_arg(&writer, f, &vargs2);
2737 if (f == NULL)
2738 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002741 const char *p;
2742 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002743
Victor Stinnere215d962012-10-06 23:03:36 +02002744 p = f;
2745 do
2746 {
2747 if ((unsigned char)*p > 127) {
2748 PyErr_Format(PyExc_ValueError,
2749 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2750 "string, got a non-ASCII byte: 0x%02x",
2751 (unsigned char)*p);
2752 return NULL;
2753 }
2754 p++;
2755 }
2756 while (*p != '\0' && *p != '%');
2757 len = p - f;
2758
2759 if (*p == '\0')
2760 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002761
2762 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002763 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002764
2765 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002767 }
Victor Stinnere215d962012-10-06 23:03:36 +02002768 return _PyUnicodeWriter_Finish(&writer);
2769
2770 fail:
2771 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002773}
2774
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775PyObject *
2776PyUnicode_FromFormat(const char *format, ...)
2777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 PyObject* ret;
2779 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002780
2781#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002783#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 ret = PyUnicode_FromFormatV(format, vargs);
2787 va_end(vargs);
2788 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002789}
2790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791#ifdef HAVE_WCHAR_H
2792
Victor Stinner5593d8a2010-10-02 11:11:27 +00002793/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2794 convert a Unicode object to a wide character string.
2795
Victor Stinnerd88d9832011-09-06 02:00:05 +02002796 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002797 character) required to convert the unicode object. Ignore size argument.
2798
Victor Stinnerd88d9832011-09-06 02:00:05 +02002799 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002800 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002803unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002804 wchar_t *w,
2805 Py_ssize_t size)
2806{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002807 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 const wchar_t *wstr;
2809
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002810 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (wstr == NULL)
2812 return -1;
2813
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002815 if (size > res)
2816 size = res + 1;
2817 else
2818 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002820 return res;
2821 }
2822 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002824}
2825
2826Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002827PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002828 wchar_t *w,
2829 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830{
2831 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 PyErr_BadInternalCall();
2833 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002835 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836}
2837
Victor Stinner137c34c2010-09-29 10:25:54 +00002838wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002839PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002840 Py_ssize_t *size)
2841{
2842 wchar_t* buffer;
2843 Py_ssize_t buflen;
2844
2845 if (unicode == NULL) {
2846 PyErr_BadInternalCall();
2847 return NULL;
2848 }
2849
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002850 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002851 if (buflen == -1)
2852 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002853 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002854 PyErr_NoMemory();
2855 return NULL;
2856 }
2857
Victor Stinner137c34c2010-09-29 10:25:54 +00002858 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2859 if (buffer == NULL) {
2860 PyErr_NoMemory();
2861 return NULL;
2862 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002863 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002864 if (buflen == -1) {
2865 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002866 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002867 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002868 if (size != NULL)
2869 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002870 return buffer;
2871}
2872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002873#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874
Alexander Belopolsky40018472011-02-26 01:02:56 +00002875PyObject *
2876PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002877{
Victor Stinner8faf8212011-12-08 22:14:11 +01002878 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002879 PyErr_SetString(PyExc_ValueError,
2880 "chr() arg not in range(0x110000)");
2881 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002882 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002883
Victor Stinner985a82a2014-01-03 12:53:47 +01002884 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002885}
2886
Alexander Belopolsky40018472011-02-26 01:02:56 +00002887PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002888PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002890 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002893 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002894 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002895 Py_INCREF(obj);
2896 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897 }
2898 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 /* For a Unicode subtype that's not a Unicode object,
2900 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002901 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002902 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002903 PyErr_Format(PyExc_TypeError,
2904 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002905 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002906 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002907}
2908
Alexander Belopolsky40018472011-02-26 01:02:56 +00002909PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002910PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002911 const char *encoding,
2912 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002913{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002914 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002915 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002916
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 PyErr_BadInternalCall();
2919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002921
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002922 /* Decoding bytes objects is the most common case and should be fast */
2923 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002924 if (PyBytes_GET_SIZE(obj) == 0)
2925 _Py_RETURN_UNICODE_EMPTY();
2926 v = PyUnicode_Decode(
2927 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2928 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002929 return v;
2930 }
2931
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002932 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 PyErr_SetString(PyExc_TypeError,
2934 "decoding str is not supported");
2935 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2939 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2940 PyErr_Format(PyExc_TypeError,
2941 "coercing to str: need bytes, bytearray "
2942 "or buffer-like object, %.80s found",
2943 Py_TYPE(obj)->tp_name);
2944 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002945 }
Tim Petersced69f82003-09-16 20:30:58 +00002946
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002947 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002948 PyBuffer_Release(&buffer);
2949 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002951
Serhiy Storchaka05997252013-01-26 12:14:02 +02002952 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002953 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002954 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955}
2956
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is written to `lower`, a buffer of `lower_len` bytes, and is
   always null-terminated on success.

   Return 0 on error (the normalized name plus its terminating null byte
   does not fit in lower_len bytes, which includes lower_len == 0),
   1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminating null byte, and
       computing &lower[lower_len - 1] below would wrap around. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2996
Alexander Belopolsky40018472011-02-26 01:02:56 +00002997PyObject *
2998PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002999 Py_ssize_t size,
3000 const char *encoding,
3001 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003002{
3003 PyObject *buffer = NULL, *unicode;
3004 Py_buffer info;
3005 char lower[11]; /* Enough for any encoding shortcut */
3006
Fred Drakee4315f52000-05-09 19:53:39 +00003007 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003008 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003009 if ((strcmp(lower, "utf-8") == 0) ||
3010 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003011 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003012 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003013 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003014 (strcmp(lower, "iso-8859-1") == 0) ||
3015 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003016 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003017#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003018 else if (strcmp(lower, "mbcs") == 0)
3019 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003020#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003021 else if (strcmp(lower, "ascii") == 0)
3022 return PyUnicode_DecodeASCII(s, size, errors);
3023 else if (strcmp(lower, "utf-16") == 0)
3024 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3025 else if (strcmp(lower, "utf-32") == 0)
3026 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003030 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003031 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003032 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003033 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 if (buffer == NULL)
3035 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003036 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 if (unicode == NULL)
3038 goto onError;
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003041 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3042 "use codecs.decode() to decode to arbitrary types",
3043 encoding,
3044 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 Py_DECREF(unicode);
3046 goto onError;
3047 }
3048 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003049 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003050
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 Py_XDECREF(buffer);
3053 return NULL;
3054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
3057PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003060{
3061 PyObject *v;
3062
3063 if (!PyUnicode_Check(unicode)) {
3064 PyErr_BadArgument();
3065 goto onError;
3066 }
3067
3068 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003070
3071 /* Decode via the codec registry */
3072 v = PyCodec_Decode(unicode, encoding, errors);
3073 if (v == NULL)
3074 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003075 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003078 return NULL;
3079}
3080
Alexander Belopolsky40018472011-02-26 01:02:56 +00003081PyObject *
3082PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003083 const char *encoding,
3084 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003085{
3086 PyObject *v;
3087
3088 if (!PyUnicode_Check(unicode)) {
3089 PyErr_BadArgument();
3090 goto onError;
3091 }
3092
3093 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003095
3096 /* Decode via the codec registry */
3097 v = PyCodec_Decode(unicode, encoding, errors);
3098 if (v == NULL)
3099 goto onError;
3100 if (!PyUnicode_Check(v)) {
3101 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003102 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3103 "use codecs.decode() to decode to arbitrary types",
3104 encoding,
3105 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106 Py_DECREF(v);
3107 goto onError;
3108 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003109 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003112 return NULL;
3113}
3114
Alexander Belopolsky40018472011-02-26 01:02:56 +00003115PyObject *
3116PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003117 Py_ssize_t size,
3118 const char *encoding,
3119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
3121 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003122
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 unicode = PyUnicode_FromUnicode(s, size);
3124 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3127 Py_DECREF(unicode);
3128 return v;
3129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
3132PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003133 const char *encoding,
3134 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003135{
3136 PyObject *v;
3137
3138 if (!PyUnicode_Check(unicode)) {
3139 PyErr_BadArgument();
3140 goto onError;
3141 }
3142
3143 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003145
3146 /* Encode via the codec registry */
3147 v = PyCodec_Encode(unicode, encoding, errors);
3148 if (v == NULL)
3149 goto onError;
3150 return v;
3151
Benjamin Peterson29060642009-01-31 22:14:21 +00003152 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003153 return NULL;
3154}
3155
/* Locate the first wide character of `wstr` that wcstombs() cannot convert
   by converting the string one character at a time (one surrogate pair at
   a time when wchar_t is 2 bytes wide).  Returns the index of that
   character in `wstr`, or 0 when every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    /* Room for a surrogate pair plus the terminator. */
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        /* Remember where this character starts before advancing. */
        const wchar_t *unit = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = *cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return unit - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3202
Victor Stinner1b579672011-12-17 05:47:23 +01003203static int
3204locale_error_handler(const char *errors, int *surrogateescape)
3205{
3206 if (errors == NULL) {
3207 *surrogateescape = 0;
3208 return 0;
3209 }
3210
3211 if (strcmp(errors, "strict") == 0) {
3212 *surrogateescape = 0;
3213 return 0;
3214 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003215 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003216 *surrogateescape = 1;
3217 return 0;
3218 }
3219 PyErr_Format(PyExc_ValueError,
3220 "only 'strict' and 'surrogateescape' error handlers "
3221 "are supported, not '%s'",
3222 errors);
3223 return -1;
3224}
3225
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003226PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003227PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003228{
3229 Py_ssize_t wlen, wlen2;
3230 wchar_t *wstr;
3231 PyObject *bytes = NULL;
3232 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003233 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003234 PyObject *exc;
3235 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003236 int surrogateescape;
3237
3238 if (locale_error_handler(errors, &surrogateescape) < 0)
3239 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003240
3241 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3242 if (wstr == NULL)
3243 return NULL;
3244
3245 wlen2 = wcslen(wstr);
3246 if (wlen2 != wlen) {
3247 PyMem_Free(wstr);
3248 PyErr_SetString(PyExc_TypeError, "embedded null character");
3249 return NULL;
3250 }
3251
3252 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003253 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003254 char *str;
3255
3256 str = _Py_wchar2char(wstr, &error_pos);
3257 if (str == NULL) {
3258 if (error_pos == (size_t)-1) {
3259 PyErr_NoMemory();
3260 PyMem_Free(wstr);
3261 return NULL;
3262 }
3263 else {
3264 goto encode_error;
3265 }
3266 }
3267 PyMem_Free(wstr);
3268
3269 bytes = PyBytes_FromString(str);
3270 PyMem_Free(str);
3271 }
3272 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003273 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003274 size_t len, len2;
3275
3276 len = wcstombs(NULL, wstr, 0);
3277 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003278 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003279 goto encode_error;
3280 }
3281
3282 bytes = PyBytes_FromStringAndSize(NULL, len);
3283 if (bytes == NULL) {
3284 PyMem_Free(wstr);
3285 return NULL;
3286 }
3287
3288 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3289 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003290 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003291 goto encode_error;
3292 }
3293 PyMem_Free(wstr);
3294 }
3295 return bytes;
3296
3297encode_error:
3298 errmsg = strerror(errno);
3299 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003300
3301 if (error_pos == (size_t)-1)
3302 error_pos = wcstombs_errorpos(wstr);
3303
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003304 PyMem_Free(wstr);
3305 Py_XDECREF(bytes);
3306
Victor Stinner2f197072011-12-17 07:08:30 +01003307 if (errmsg != NULL) {
3308 size_t errlen;
3309 wstr = _Py_char2wchar(errmsg, &errlen);
3310 if (wstr != NULL) {
3311 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003312 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003313 } else
3314 errmsg = NULL;
3315 }
3316 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003317 reason = PyUnicode_FromString(
3318 "wcstombs() encountered an unencodable "
3319 "wide character");
3320 if (reason == NULL)
3321 return NULL;
3322
3323 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3324 "locale", unicode,
3325 (Py_ssize_t)error_pos,
3326 (Py_ssize_t)(error_pos+1),
3327 reason);
3328 Py_DECREF(reason);
3329 if (exc != NULL) {
3330 PyCodec_StrictErrors(exc);
3331 Py_XDECREF(exc);
3332 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003333 return NULL;
3334}
3335
Victor Stinnerad158722010-10-27 00:25:46 +00003336PyObject *
3337PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003338{
Victor Stinner99b95382011-07-04 14:23:54 +02003339#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003340 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003341#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003343#else
Victor Stinner793b5312011-04-27 00:24:21 +02003344 PyInterpreterState *interp = PyThreadState_GET()->interp;
3345 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3346 cannot use it to encode and decode filenames before it is loaded. Load
3347 the Python codec requires to encode at least its own filename. Use the C
3348 version of the locale codec until the codec registry is initialized and
3349 the Python codec is loaded.
3350
3351 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3352 cannot only rely on it: check also interp->fscodec_initialized for
3353 subinterpreters. */
3354 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003355 return PyUnicode_AsEncodedString(unicode,
3356 Py_FileSystemDefaultEncoding,
3357 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003358 }
3359 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003360 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003361 }
Victor Stinnerad158722010-10-27 00:25:46 +00003362#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003363}
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 const char *encoding,
3368 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369{
3370 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003371 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003372
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 if (!PyUnicode_Check(unicode)) {
3374 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 }
Fred Drakee4315f52000-05-09 19:53:39 +00003377
Fred Drakee4315f52000-05-09 19:53:39 +00003378 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003379 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003380 if ((strcmp(lower, "utf-8") == 0) ||
3381 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003382 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003383 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003384 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003385 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003387 }
Victor Stinner37296e82010-06-10 13:36:23 +00003388 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003389 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003390 (strcmp(lower, "iso-8859-1") == 0) ||
3391 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003393#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003394 else if (strcmp(lower, "mbcs") == 0)
3395 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003396#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003397 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400
3401 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003402 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003404 return NULL;
3405
3406 /* The normal path */
3407 if (PyBytes_Check(v))
3408 return v;
3409
3410 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003411 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003412 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003413 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003414
3415 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003416 "encoder %s returned bytearray instead of bytes; "
3417 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003418 encoding);
3419 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003420 Py_DECREF(v);
3421 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003423
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003424 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3425 Py_DECREF(v);
3426 return b;
3427 }
3428
3429 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003430 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3431 "use codecs.encode() to encode to arbitrary types",
3432 encoding,
3433 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003434 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003435 return NULL;
3436}
3437
Alexander Belopolsky40018472011-02-26 01:02:56 +00003438PyObject *
3439PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003440 const char *encoding,
3441 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003442{
3443 PyObject *v;
3444
3445 if (!PyUnicode_Check(unicode)) {
3446 PyErr_BadArgument();
3447 goto onError;
3448 }
3449
3450 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003451 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003452
3453 /* Encode via the codec registry */
3454 v = PyCodec_Encode(unicode, encoding, errors);
3455 if (v == NULL)
3456 goto onError;
3457 if (!PyUnicode_Check(v)) {
3458 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003459 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3460 "use codecs.encode() to encode to arbitrary types",
3461 encoding,
3462 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003463 Py_DECREF(v);
3464 goto onError;
3465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003467
Benjamin Peterson29060642009-01-31 22:14:21 +00003468 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 return NULL;
3470}
3471
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() cannot decode.  Returns the byte offset of that sequence, or 0
   when none is found.  Without HAVE_MBRTOWC the position cannot be
   determined and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Begin in the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3502
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003503PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003504PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003505 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506{
3507 wchar_t smallbuf[256];
3508 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3509 wchar_t *wstr;
3510 size_t wlen, wlen2;
3511 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003512 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003513 size_t error_pos;
3514 char *errmsg;
3515 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003516
3517 if (locale_error_handler(errors, &surrogateescape) < 0)
3518 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003519
3520 if (str[len] != '\0' || len != strlen(str)) {
3521 PyErr_SetString(PyExc_TypeError, "embedded null character");
3522 return NULL;
3523 }
3524
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003525 if (surrogateescape) {
3526 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003527 wstr = _Py_char2wchar(str, &wlen);
3528 if (wstr == NULL) {
3529 if (wlen == (size_t)-1)
3530 PyErr_NoMemory();
3531 else
3532 PyErr_SetFromErrno(PyExc_OSError);
3533 return NULL;
3534 }
3535
3536 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003537 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003538 }
3539 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003540 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003541#ifndef HAVE_BROKEN_MBSTOWCS
3542 wlen = mbstowcs(NULL, str, 0);
3543#else
3544 wlen = len;
3545#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003546 if (wlen == (size_t)-1)
3547 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003548 if (wlen+1 <= smallbuf_len) {
3549 wstr = smallbuf;
3550 }
3551 else {
3552 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3553 return PyErr_NoMemory();
3554
3555 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3556 if (!wstr)
3557 return PyErr_NoMemory();
3558 }
3559
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003560 wlen2 = mbstowcs(wstr, str, wlen+1);
3561 if (wlen2 == (size_t)-1) {
3562 if (wstr != smallbuf)
3563 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003564 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003565 }
3566#ifdef HAVE_BROKEN_MBSTOWCS
3567 assert(wlen2 == wlen);
3568#endif
3569 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3570 if (wstr != smallbuf)
3571 PyMem_Free(wstr);
3572 }
3573 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003574
3575decode_error:
3576 errmsg = strerror(errno);
3577 assert(errmsg != NULL);
3578
3579 error_pos = mbstowcs_errorpos(str, len);
3580 if (errmsg != NULL) {
3581 size_t errlen;
3582 wstr = _Py_char2wchar(errmsg, &errlen);
3583 if (wstr != NULL) {
3584 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003585 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003586 } else
3587 errmsg = NULL;
3588 }
3589 if (errmsg == NULL)
3590 reason = PyUnicode_FromString(
3591 "mbstowcs() encountered an invalid multibyte sequence");
3592 if (reason == NULL)
3593 return NULL;
3594
3595 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3596 "locale", str, len,
3597 (Py_ssize_t)error_pos,
3598 (Py_ssize_t)(error_pos+1),
3599 reason);
3600 Py_DECREF(reason);
3601 if (exc != NULL) {
3602 PyCodec_StrictErrors(exc);
3603 Py_XDECREF(exc);
3604 }
3605 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003606}
3607
3608PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003609PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003610{
3611 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003612 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003613}
3614
3615
3616PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003617PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003618 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003619 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3620}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003621
Christian Heimes5894ba72007-11-04 11:43:14 +00003622PyObject*
3623PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3624{
Victor Stinner99b95382011-07-04 14:23:54 +02003625#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003626 return PyUnicode_DecodeMBCS(s, size, NULL);
3627#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003628 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003629#else
Victor Stinner793b5312011-04-27 00:24:21 +02003630 PyInterpreterState *interp = PyThreadState_GET()->interp;
3631 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3632 cannot use it to encode and decode filenames before it is loaded. Load
3633 the Python codec requires to encode at least its own filename. Use the C
3634 version of the locale codec until the codec registry is initialized and
3635 the Python codec is loaded.
3636
3637 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3638 cannot only rely on it: check also interp->fscodec_initialized for
3639 subinterpreters. */
3640 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003641 return PyUnicode_Decode(s, size,
3642 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003643 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003644 }
3645 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003646 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003647 }
Victor Stinnerad158722010-10-27 00:25:46 +00003648#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003649}
3650
Martin v. Löwis011e8422009-05-05 04:43:17 +00003651
3652int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003653_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003654{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003655 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003656
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003657 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003658 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003659 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3660 PyUnicode_GET_LENGTH(str), '\0', 1);
3661 if (pos == -1)
3662 return 0;
3663 else
3664 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003665}
3666
Antoine Pitrou13348842012-01-29 18:36:34 +01003667int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003668PyUnicode_FSConverter(PyObject* arg, void* addr)
3669{
3670 PyObject *output = NULL;
3671 Py_ssize_t size;
3672 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003673 if (arg == NULL) {
3674 Py_DECREF(*(PyObject**)addr);
3675 return 1;
3676 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003677 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003678 output = arg;
3679 Py_INCREF(output);
3680 }
3681 else {
3682 arg = PyUnicode_FromObject(arg);
3683 if (!arg)
3684 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003685 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003686 Py_DECREF(arg);
3687 if (!output)
3688 return 0;
3689 if (!PyBytes_Check(output)) {
3690 Py_DECREF(output);
3691 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3692 return 0;
3693 }
3694 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003695 size = PyBytes_GET_SIZE(output);
3696 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003697 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003698 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003699 Py_DECREF(output);
3700 return 0;
3701 }
3702 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003703 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003704}
3705
3706
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003707int
3708PyUnicode_FSDecoder(PyObject* arg, void* addr)
3709{
3710 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003711 if (arg == NULL) {
3712 Py_DECREF(*(PyObject**)addr);
3713 return 1;
3714 }
3715 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003716 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003718 output = arg;
3719 Py_INCREF(output);
3720 }
3721 else {
3722 arg = PyBytes_FromObject(arg);
3723 if (!arg)
3724 return 0;
3725 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3726 PyBytes_GET_SIZE(arg));
3727 Py_DECREF(arg);
3728 if (!output)
3729 return 0;
3730 if (!PyUnicode_Check(output)) {
3731 Py_DECREF(output);
3732 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3733 return 0;
3734 }
3735 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003736 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003737 Py_DECREF(output);
3738 return 0;
3739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003741 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003742 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3743 Py_DECREF(output);
3744 return 0;
3745 }
3746 *(PyObject**)addr = output;
3747 return Py_CLEANUP_SUPPORTED;
3748}
3749
3750
Martin v. Löwis5b222132007-06-10 09:51:05 +00003751char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003753{
Christian Heimesf3863112007-11-22 07:46:41 +00003754 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 return NULL;
3759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003761 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003763 if (PyUnicode_UTF8(unicode) == NULL) {
3764 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3766 if (bytes == NULL)
3767 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3769 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003770 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 Py_DECREF(bytes);
3772 return NULL;
3773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3775 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3776 PyBytes_AS_STRING(bytes),
3777 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778 Py_DECREF(bytes);
3779 }
3780
3781 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003782 *psize = PyUnicode_UTF8_LENGTH(unicode);
3783 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003784}
3785
3786char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3790}
3791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003792Py_UNICODE *
3793PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 const unsigned char *one_byte;
3796#if SIZEOF_WCHAR_T == 4
3797 const Py_UCS2 *two_bytes;
3798#else
3799 const Py_UCS4 *four_bytes;
3800 const Py_UCS4 *ucs4_end;
3801 Py_ssize_t num_surrogates;
3802#endif
3803 wchar_t *w;
3804 wchar_t *wchar_end;
3805
3806 if (!PyUnicode_Check(unicode)) {
3807 PyErr_BadArgument();
3808 return NULL;
3809 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003810 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003812 assert(_PyUnicode_KIND(unicode) != 0);
3813 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003817 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3818 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819 num_surrogates = 0;
3820
3821 for (; four_bytes < ucs4_end; ++four_bytes) {
3822 if (*four_bytes > 0xFFFF)
3823 ++num_surrogates;
3824 }
3825
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3827 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3828 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 PyErr_NoMemory();
3830 return NULL;
3831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 w = _PyUnicode_WSTR(unicode);
3835 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3836 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3838 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003839 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003841 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3842 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 }
3844 else
3845 *w = *four_bytes;
3846
3847 if (w > wchar_end) {
3848 assert(0 && "Miscalculated string end");
3849 }
3850 }
3851 *w = 0;
3852#else
3853 /* sizeof(wchar_t) == 4 */
3854 Py_FatalError("Impossible unicode object state, wstr and str "
3855 "should share memory already.");
3856 return NULL;
3857#endif
3858 }
3859 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3861 (_PyUnicode_LENGTH(unicode) + 1));
3862 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 PyErr_NoMemory();
3864 return NULL;
3865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3867 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3868 w = _PyUnicode_WSTR(unicode);
3869 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003871 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3872 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 for (; w < wchar_end; ++one_byte, ++w)
3874 *w = *one_byte;
3875 /* null-terminate the wstr */
3876 *w = 0;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 for (; w < wchar_end; ++two_bytes, ++w)
3882 *w = *two_bytes;
3883 /* null-terminate the wstr */
3884 *w = 0;
3885#else
3886 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 PyObject_FREE(_PyUnicode_WSTR(unicode));
3888 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 Py_FatalError("Impossible unicode object state, wstr "
3890 "and str should share memory already.");
3891 return NULL;
3892#endif
3893 }
3894 else {
3895 assert(0 && "This should never happen.");
3896 }
3897 }
3898 }
3899 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 *size = PyUnicode_WSTR_LENGTH(unicode);
3901 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003902}
3903
Alexander Belopolsky40018472011-02-26 01:02:56 +00003904Py_UNICODE *
3905PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908}
3909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910
Alexander Belopolsky40018472011-02-26 01:02:56 +00003911Py_ssize_t
3912PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913{
3914 if (!PyUnicode_Check(unicode)) {
3915 PyErr_BadArgument();
3916 goto onError;
3917 }
3918 return PyUnicode_GET_SIZE(unicode);
3919
Benjamin Peterson29060642009-01-31 22:14:21 +00003920 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 return -1;
3922}
3923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924Py_ssize_t
3925PyUnicode_GetLength(PyObject *unicode)
3926{
Victor Stinner07621332012-06-16 04:53:46 +02003927 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928 PyErr_BadArgument();
3929 return -1;
3930 }
Victor Stinner07621332012-06-16 04:53:46 +02003931 if (PyUnicode_READY(unicode) == -1)
3932 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 return PyUnicode_GET_LENGTH(unicode);
3934}
3935
3936Py_UCS4
3937PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3938{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003939 void *data;
3940 int kind;
3941
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003942 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3943 PyErr_BadArgument();
3944 return (Py_UCS4)-1;
3945 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003946 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003947 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 return (Py_UCS4)-1;
3949 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003950 data = PyUnicode_DATA(unicode);
3951 kind = PyUnicode_KIND(unicode);
3952 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953}
3954
3955int
3956PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3957{
3958 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003959 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 return -1;
3961 }
Victor Stinner488fa492011-12-12 00:01:39 +01003962 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003963 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003964 PyErr_SetString(PyExc_IndexError, "string index out of range");
3965 return -1;
3966 }
Victor Stinner488fa492011-12-12 00:01:39 +01003967 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003968 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003969 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3970 PyErr_SetString(PyExc_ValueError, "character out of range");
3971 return -1;
3972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3974 index, ch);
3975 return 0;
3976}
3977
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3983
Victor Stinner554f3f02010-06-16 23:33:54 +00003984/* create or adjust a UnicodeDecodeError */
3985static void
3986make_decode_exception(PyObject **exceptionObject,
3987 const char *encoding,
3988 const char *input, Py_ssize_t length,
3989 Py_ssize_t startpos, Py_ssize_t endpos,
3990 const char *reason)
3991{
3992 if (*exceptionObject == NULL) {
3993 *exceptionObject = PyUnicodeDecodeError_Create(
3994 encoding, input, length, startpos, endpos, reason);
3995 }
3996 else {
3997 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3998 goto onError;
3999 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4000 goto onError;
4001 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4002 goto onError;
4003 }
4004 return;
4005
4006onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004007 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004008}
4009
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   In/out parameters:
     errorHandler     looked up from `errors` on first use and cached
     exceptionObject  created or updated through make_decode_exception()
     input, inend     reloaded from the exception's bytes object, which the
                      callback may have replaced
     endinpos, inptr  moved to the position returned by the callback
     output, outpos   wchar_t-kind output string (resized when needed) and
                      the write position inside it
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the object must not be read as bytes below. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string).
       Each addition is checked so the sum cannot wrap around. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically, unless doubling would overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4116
4117static int
4118unicode_decode_call_errorhandler_writer(
4119 const char *errors, PyObject **errorHandler,
4120 const char *encoding, const char *reason,
4121 const char **input, const char **inend, Py_ssize_t *startinpos,
4122 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4123 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4124{
4125 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4126
4127 PyObject *restuple = NULL;
4128 PyObject *repunicode = NULL;
4129 Py_ssize_t insize;
4130 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004131 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004132 PyObject *inputobj = NULL;
4133
4134 if (*errorHandler == NULL) {
4135 *errorHandler = PyCodec_LookupError(errors);
4136 if (*errorHandler == NULL)
4137 goto onError;
4138 }
4139
4140 make_decode_exception(exceptionObject,
4141 encoding,
4142 *input, *inend - *input,
4143 *startinpos, *endinpos,
4144 reason);
4145 if (*exceptionObject == NULL)
4146 goto onError;
4147
4148 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4149 if (restuple == NULL)
4150 goto onError;
4151 if (!PyTuple_Check(restuple)) {
4152 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4153 goto onError;
4154 }
4155 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004156 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
4158 /* Copy back the bytes variables, which might have been modified by the
4159 callback */
4160 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4161 if (!inputobj)
4162 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004163 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004165 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004166 *input = PyBytes_AS_STRING(inputobj);
4167 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004168 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004169 /* we can DECREF safely, as the exception has another reference,
4170 so the object won't go away. */
4171 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004175 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4177 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179
Victor Stinner8f674cc2013-04-17 23:02:17 +02004180 if (PyUnicode_READY(repunicode) < 0)
4181 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004182 replen = PyUnicode_GET_LENGTH(repunicode);
4183 writer->min_length += replen;
4184 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004185 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004186 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004187 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004190 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193 Py_XDECREF(restuple);
4194 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199}
4200
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004201/* --- UTF-7 Codec -------------------------------------------------------- */
4202
Antoine Pitrou244651a2009-05-04 18:56:13 +00004203/* See RFC2152 for details. We encode conservatively and decode liberally. */
4204
/* Three small helper macros for the base-64 alphabet, plus DECODE_DIRECT.
   Each of them may evaluate its argument more than once. */

/* True when c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* The 6-bit value of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds: any other input falls through to 63. */

#define FROM_BASE64(c)                              \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :       \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :  \
     (c) == '+' ? 62 : 63)

/* The base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive; the only ASCII byte that does not
 * decode to itself is the '+' which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
4235
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be an integer value; the range test runs before the table
 * lookup, so values outside 1..127 never index utf7_category.  directO
 * and directWS are parenthesized so that compound expressions such as
 * `a || b` keep their meaning.  c may be evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004281
Alexander Belopolsky40018472011-02-26 01:02:56 +00004282PyObject *
4283PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004284 Py_ssize_t size,
4285 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004287 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4288}
4289
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290/* The decoder. The only state we preserve is our read position,
4291 * i.e. how many characters we have consumed. So if we end in the
4292 * middle of a shift sequence we have to back off the read position
4293 * and the output to the beginning of the sequence, otherwise we lose
4294 * all the shift state (seen bits, number of bits seen, high
4295 * surrogate). */
4296
Alexander Belopolsky40018472011-02-26 01:02:56 +00004297PyObject *
4298PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004299 Py_ssize_t size,
4300 const char *errors,
4301 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004304 Py_ssize_t startinpos;
4305 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004307 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004308 const char *errmsg = "";
4309 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004310 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311 unsigned int base64bits = 0;
4312 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004313 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 PyObject *errorHandler = NULL;
4315 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004317 if (size == 0) {
4318 if (consumed)
4319 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004320 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004321 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004324 _PyUnicodeWriter_Init(&writer);
4325 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326
4327 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328 e = s + size;
4329
4330 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004333 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335 if (inShift) { /* in a base-64 section */
4336 if (IS_BASE64(ch)) { /* consume a base-64 character */
4337 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4338 base64bits += 6;
4339 s++;
4340 if (base64bits >= 16) {
4341 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004342 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 base64bits -= 16;
4344 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004345 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004346 if (surrogate) {
4347 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004348 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4349 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004350 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004353 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 }
4355 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004356 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004357 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
4360 }
Victor Stinner551ac952011-11-29 22:58:13 +01004361 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 /* first surrogate */
4363 surrogate = outCh;
4364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004366 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 }
4369 }
4370 }
4371 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 inShift = 0;
4373 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004375 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004376 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004377 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (base64bits > 0) { /* left-over bits */
4380 if (base64bits >= 6) {
4381 /* We've seen at least one base-64 character */
4382 errmsg = "partial character in shift sequence";
4383 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 else {
4386 /* Some bits remain; they should be zero */
4387 if (base64buffer != 0) {
4388 errmsg = "non-zero padding bits in shift sequence";
4389 goto utf7Error;
4390 }
4391 }
4392 }
4393 if (ch != '-') {
4394 /* '-' is absorbed; other terminating
4395 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004396 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004397 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
4400 }
4401 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 s++; /* consume '+' */
4404 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004406 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 }
4409 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004413 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
4415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004418 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else {
4422 startinpos = s-starts;
4423 s++;
4424 errmsg = "unexpected special character";
4425 goto utf7Error;
4426 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 errors, &errorHandler,
4432 "utf7", errmsg,
4433 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004434 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004436 }
4437
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 /* end of string */
4439
4440 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4441 /* if we're in an inconsistent state, that's an error */
4442 if (surrogate ||
4443 (base64bits >= 6) ||
4444 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004446 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 errors, &errorHandler,
4448 "utf7", "unterminated shift sequence",
4449 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 goto onError;
4452 if (s < e)
4453 goto restart;
4454 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456
4457 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004461 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004462 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004463 writer.kind, writer.data, shiftOutStart);
4464 Py_XDECREF(errorHandler);
4465 Py_XDECREF(exc);
4466 _PyUnicodeWriter_Dealloc(&writer);
4467 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004468 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004469 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 }
4471 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004474 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(errorHandler);
4477 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004478 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 Py_XDECREF(errorHandler);
4482 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 return NULL;
4485}
4486
4487
Alexander Belopolsky40018472011-02-26 01:02:56 +00004488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489_PyUnicode_EncodeUTF7(PyObject *str,
4490 int base64SetO,
4491 int base64WhiteSpace,
4492 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004494 int kind;
4495 void *data;
4496 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004497 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 unsigned int base64bits = 0;
4501 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 char * out;
4503 char * start;
4504
Benjamin Petersonbac79492012-01-14 13:34:47 -05004505 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004506 return NULL;
4507 kind = PyUnicode_KIND(str);
4508 data = PyUnicode_DATA(str);
4509 len = PyUnicode_GET_LENGTH(str);
4510
4511 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004514 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004515 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004516 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004517 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 if (v == NULL)
4519 return NULL;
4520
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004521 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004522 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004523 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 if (inShift) {
4526 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4527 /* shifting out */
4528 if (base64bits) { /* output remaining bits */
4529 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4530 base64buffer = 0;
4531 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 }
4533 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 /* Characters not in the BASE64 set implicitly unshift the sequence
4535 so no '-' is required, except if the character is itself a '-' */
4536 if (IS_BASE64(ch) || ch == '-') {
4537 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 *out++ = (char) ch;
4540 }
4541 else {
4542 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004543 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 else { /* not in a shift sequence */
4546 if (ch == '+') {
4547 *out++ = '+';
4548 *out++ = '-';
4549 }
4550 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4551 *out++ = (char) ch;
4552 }
4553 else {
4554 *out++ = '+';
4555 inShift = 1;
4556 goto encode_char;
4557 }
4558 }
4559 continue;
4560encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004562 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004563
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 /* code first surrogate */
4565 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004566 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 while (base64bits >= 6) {
4568 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4569 base64bits -= 6;
4570 }
4571 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004572 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits += 16;
4575 base64buffer = (base64buffer << 16) | ch;
4576 while (base64bits >= 6) {
4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4578 base64bits -= 6;
4579 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004580 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (base64bits)
4582 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4583 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004585 if (_PyBytes_Resize(&v, out - start) < 0)
4586 return NULL;
4587 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589PyObject *
4590PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4591 Py_ssize_t size,
4592 int base64SetO,
4593 int base64WhiteSpace,
4594 const char *errors)
4595{
4596 PyObject *result;
4597 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4598 if (tmp == NULL)
4599 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004600 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004601 base64WhiteSpace, errors);
4602 Py_DECREF(tmp);
4603 return result;
4604}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606#undef IS_BASE64
4607#undef FROM_BASE64
4608#undef TO_BASE64
4609#undef DECODE_DIRECT
4610#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612/* --- UTF-8 Codec -------------------------------------------------------- */
4613
Alexander Belopolsky40018472011-02-26 01:02:56 +00004614PyObject *
4615PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004616 Py_ssize_t size,
4617 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618{
Walter Dörwald69652032004-09-07 20:24:22 +00004619 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4620}
4621
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004622#include "stringlib/asciilib.h"
4623#include "stringlib/codecs.h"
4624#include "stringlib/undef.h"
4625
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004626#include "stringlib/ucs1lib.h"
4627#include "stringlib/codecs.h"
4628#include "stringlib/undef.h"
4629
4630#include "stringlib/ucs2lib.h"
4631#include "stringlib/codecs.h"
4632#include "stringlib/undef.h"
4633
4634#include "stringlib/ucs4lib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
Antoine Pitrouab868312009-01-10 15:40:25 +00004638/* Mask to quickly check whether a C 'long' contains a
4639 non-ASCII, UTF8-encoded char. */
4640#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004641# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004642#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004643# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004644#else
4645# error C 'long' size should be either 4 or 8!
4646#endif
4647
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648static Py_ssize_t
4649ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004652 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004654 /*
4655 * Issue #17237: m68k is a bit different from most architectures in
4656 * that objects do not use "natural alignment" - for example, int and
4657 * long are only aligned at 2-byte boundaries. Therefore the assert()
4658 * won't work; also, tests have shown that skipping the "optimised
4659 * version" will even speed up m68k.
4660 */
4661#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004663 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4664 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004665 /* Fast path, see in STRINGLIB(utf8_decode) for
4666 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004667 /* Help allocation */
4668 const char *_p = p;
4669 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670 while (_p < aligned_end) {
4671 unsigned long value = *(const unsigned long *) _p;
4672 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 *((unsigned long *)q) = value;
4675 _p += SIZEOF_LONG;
4676 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004677 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678 p = _p;
4679 while (p < end) {
4680 if ((unsigned char)*p & 0x80)
4681 break;
4682 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004687#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 while (p < end) {
4689 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4690 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004691 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004692 /* Help allocation */
4693 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004694 while (_p < aligned_end) {
4695 unsigned long value = *(unsigned long *) _p;
4696 if (value & ASCII_CHAR_MASK)
4697 break;
4698 _p += SIZEOF_LONG;
4699 }
4700 p = _p;
4701 if (_p == end)
4702 break;
4703 }
4704 if ((unsigned char)*p & 0x80)
4705 break;
4706 ++p;
4707 }
4708 memcpy(dest, start, p - start);
4709 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710}
Antoine Pitrouab868312009-01-10 15:40:25 +00004711
Victor Stinner785938e2011-12-11 20:09:03 +01004712PyObject *
4713PyUnicode_DecodeUTF8Stateful(const char *s,
4714 Py_ssize_t size,
4715 const char *errors,
4716 Py_ssize_t *consumed)
4717{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004719 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721
4722 Py_ssize_t startinpos;
4723 Py_ssize_t endinpos;
4724 const char *errmsg = "";
4725 PyObject *errorHandler = NULL;
4726 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004727
4728 if (size == 0) {
4729 if (consumed)
4730 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004731 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004732 }
4733
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4735 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004736 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737 *consumed = 1;
4738 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004739 }
4740
Victor Stinner8f674cc2013-04-17 23:02:17 +02004741 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004742 writer.min_length = size;
4743 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004744 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004745
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 writer.pos = ascii_decode(s, end, writer.data);
4747 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 while (s < end) {
4749 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004750 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004752 if (PyUnicode_IS_ASCII(writer.buffer))
4753 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004757 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 } else {
4759 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004760 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 }
4762
4763 switch (ch) {
4764 case 0:
4765 if (s == end || consumed)
4766 goto End;
4767 errmsg = "unexpected end of data";
4768 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004769 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 break;
4771 case 1:
4772 errmsg = "invalid start byte";
4773 startinpos = s - starts;
4774 endinpos = startinpos + 1;
4775 break;
4776 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004777 case 3:
4778 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 errmsg = "invalid continuation byte";
4780 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004781 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 break;
4783 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004784 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785 goto onError;
4786 continue;
4787 }
4788
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004789 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004790 errors, &errorHandler,
4791 "utf-8", errmsg,
4792 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004793 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004795 }
4796
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798 if (consumed)
4799 *consumed = s - starts;
4800
4801 Py_XDECREF(errorHandler);
4802 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004803 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804
4805onError:
4806 Py_XDECREF(errorHandler);
4807 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004810}
4811
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Bytes that cannot be decoded are stored as 0xDC00 + byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The value is split into a surrogate pair below, so it must
               be a code point above the BMP, not a surrogate itself. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868/* Primary internal function which creates utf8 encoded bytes objects.
4869
4870 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004871 and allocate exactly as much space needed at the end. Else allocate the
4872 maximum possible needed (4 result bytes per Unicode character), and return
4873 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004874*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004875PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004876_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877{
Victor Stinner6099a032011-12-18 14:22:26 +01004878 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879 void *data;
4880 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 if (!PyUnicode_Check(unicode)) {
4883 PyErr_BadArgument();
4884 return NULL;
4885 }
4886
4887 if (PyUnicode_READY(unicode) == -1)
4888 return NULL;
4889
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004890 if (PyUnicode_UTF8(unicode))
4891 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4892 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
4894 kind = PyUnicode_KIND(unicode);
4895 data = PyUnicode_DATA(unicode);
4896 size = PyUnicode_GET_LENGTH(unicode);
4897
Benjamin Petersonead6b532011-12-20 17:23:42 -06004898 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004899 default:
4900 assert(0);
4901 case PyUnicode_1BYTE_KIND:
4902 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4903 assert(!PyUnicode_IS_ASCII(unicode));
4904 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4905 case PyUnicode_2BYTE_KIND:
4906 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4907 case PyUnicode_4BYTE_KIND:
4908 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910}
4911
Alexander Belopolsky40018472011-02-26 01:02:56 +00004912PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004913PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4914 Py_ssize_t size,
4915 const char *errors)
4916{
4917 PyObject *v, *unicode;
4918
4919 unicode = PyUnicode_FromUnicode(s, size);
4920 if (unicode == NULL)
4921 return NULL;
4922 v = _PyUnicode_AsUTF8String(unicode, errors);
4923 Py_DECREF(unicode);
4924 return v;
4925}
4926
4927PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004928PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004930 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931}
4932
Walter Dörwald41980ca2007-08-16 21:55:45 +00004933/* --- UTF-32 Codec ------------------------------------------------------- */
4934
4935PyObject *
4936PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 Py_ssize_t size,
4938 const char *errors,
4939 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940{
4941 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4942}
4943
4944PyObject *
4945PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors,
4948 int *byteorder,
4949 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950{
4951 const char *starts = s;
4952 Py_ssize_t startinpos;
4953 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004954 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004955 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004956 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004957 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 PyObject *errorHandler = NULL;
4960 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004961
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962 q = (unsigned char *)s;
4963 e = q + size;
4964
4965 if (byteorder)
4966 bo = *byteorder;
4967
4968 /* Check for BOM marks (U+FEFF) in the input and adjust current
4969 byte order setting accordingly. In native mode, the leading BOM
4970 mark is skipped, in all other modes, it is copied to the output
4971 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004972 if (bo == 0 && size >= 4) {
4973 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4974 if (bom == 0x0000FEFF) {
4975 bo = -1;
4976 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004978 else if (bom == 0xFFFE0000) {
4979 bo = 1;
4980 q += 4;
4981 }
4982 if (byteorder)
4983 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 }
4985
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 if (q == e) {
4987 if (consumed)
4988 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004989 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990 }
4991
Victor Stinnere64322e2012-10-30 23:12:47 +01004992#ifdef WORDS_BIGENDIAN
4993 le = bo < 0;
4994#else
4995 le = bo <= 0;
4996#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004997 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004998
Victor Stinner8f674cc2013-04-17 23:02:17 +02004999 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005000 writer.min_length = (e - q + 3) / 4;
5001 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005003
Victor Stinnere64322e2012-10-30 23:12:47 +01005004 while (1) {
5005 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005007
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 enum PyUnicode_Kind kind = writer.kind;
5010 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005011 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 if (le) {
5014 do {
5015 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5016 if (ch > maxch)
5017 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005018 if (kind != PyUnicode_1BYTE_KIND &&
5019 Py_UNICODE_IS_SURROGATE(ch))
5020 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005021 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 q += 4;
5023 } while (q <= last);
5024 }
5025 else {
5026 do {
5027 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5028 if (ch > maxch)
5029 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005030 if (kind != PyUnicode_1BYTE_KIND &&
5031 Py_UNICODE_IS_SURROGATE(ch))
5032 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005033 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 q += 4;
5035 } while (q <= last);
5036 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005037 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005038 }
5039
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005040 if (Py_UNICODE_IS_SURROGATE(ch)) {
5041 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5042 startinpos = ((const char *)q) - starts;
5043 endinpos = startinpos + 4;
5044 }
5045 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005046 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005048 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005050 startinpos = ((const char *)q) - starts;
5051 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005053 else {
5054 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005055 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005056 goto onError;
5057 q += 4;
5058 continue;
5059 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005061 startinpos = ((const char *)q) - starts;
5062 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005064
5065 /* The remaining input chars are ignored if the callback
5066 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005067 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005069 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005071 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073 }
5074
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 Py_XDECREF(errorHandler);
5079 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005080 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005083 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 Py_XDECREF(errorHandler);
5085 Py_XDECREF(exc);
5086 return NULL;
5087}
5088
5089PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005090_PyUnicode_EncodeUTF32(PyObject *str,
5091 const char *errors,
5092 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005094 int kind;
5095 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005096 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005098 unsigned char *p;
5099 Py_ssize_t nsize, i;
5100 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005101#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005102 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005104 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005106 const char *encoding;
5107 PyObject *errorHandler = NULL;
5108 PyObject *exc = NULL;
5109 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110
Serhiy Storchaka30793282014-01-04 22:44:01 +02005111#define STORECHAR(CH) \
5112 do { \
5113 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5114 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5115 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5116 p[iorder[0]] = (CH) & 0xff; \
5117 p += 4; \
5118 } while(0)
5119
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (!PyUnicode_Check(str)) {
5121 PyErr_BadArgument();
5122 return NULL;
5123 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005124 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005125 return NULL;
5126 kind = PyUnicode_KIND(str);
5127 data = PyUnicode_DATA(str);
5128 len = PyUnicode_GET_LENGTH(str);
5129
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005130 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005131 if (nsize > PY_SSIZE_T_MAX / 4)
5132 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005133 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (v == NULL)
5135 return NULL;
5136
Serhiy Storchaka30793282014-01-04 22:44:01 +02005137 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005139 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005140 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005141 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142
Serhiy Storchaka30793282014-01-04 22:44:01 +02005143 if (byteorder == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005149 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005150 }
5151 else if (byteorder == 1) {
5152 /* force BE */
5153 iorder[0] = 3;
5154 iorder[1] = 2;
5155 iorder[2] = 1;
5156 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005157 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005158 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005159 else
5160 encoding = "utf-32";
5161
5162 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005163 for (i = 0; i < len; i++)
5164 STORECHAR(PyUnicode_READ(kind, data, i));
5165 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166 }
5167
Serhiy Storchaka30793282014-01-04 22:44:01 +02005168 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005169 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005170 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5171 i++;
5172 assert(ch <= MAX_UNICODE);
5173 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5174 STORECHAR(ch);
5175 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005177
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005178 rep = unicode_encode_call_errorhandler(
5179 errors, &errorHandler,
5180 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005181 str, &exc, i-1, i, &i);
5182
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 if (!rep)
5184 goto error;
5185
5186 if (PyBytes_Check(rep)) {
5187 repsize = PyBytes_GET_SIZE(rep);
5188 if (repsize & 3) {
5189 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005190 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 "surrogates not allowed");
5192 goto error;
5193 }
5194 moreunits = repsize / 4;
5195 }
5196 else {
5197 assert(PyUnicode_Check(rep));
5198 if (PyUnicode_READY(rep) < 0)
5199 goto error;
5200 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5201 if (!PyUnicode_IS_ASCII(rep)) {
5202 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005203 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005204 "surrogates not allowed");
5205 goto error;
5206 }
5207 }
5208
5209 /* four bytes are reserved for each surrogate */
5210 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005211 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005212 Py_ssize_t morebytes = 4 * (moreunits - 1);
5213 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5214 /* integer overflow */
5215 PyErr_NoMemory();
5216 goto error;
5217 }
5218 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5219 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005220 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005221 }
5222
5223 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005224 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5225 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005226 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005227 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005228 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005229 repdata = PyUnicode_1BYTE_DATA(rep);
5230 while (repsize--) {
5231 Py_UCS4 ch = *repdata++;
5232 STORECHAR(ch);
5233 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005234 }
5235
5236 Py_CLEAR(rep);
5237 }
5238
5239 /* Cut back to size actually needed. This is necessary for, for example,
5240 encoding of a string containing isolated surrogates and the 'ignore'
5241 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005242 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005243 if (nsize != PyBytes_GET_SIZE(v))
5244 _PyBytes_Resize(&v, nsize);
5245 Py_XDECREF(errorHandler);
5246 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005247 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005248 error:
5249 Py_XDECREF(rep);
5250 Py_XDECREF(errorHandler);
5251 Py_XDECREF(exc);
5252 Py_XDECREF(v);
5253 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005254#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255}
5256
Alexander Belopolsky40018472011-02-26 01:02:56 +00005257PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005258PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5259 Py_ssize_t size,
5260 const char *errors,
5261 int byteorder)
5262{
5263 PyObject *result;
5264 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5265 if (tmp == NULL)
5266 return NULL;
5267 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5268 Py_DECREF(tmp);
5269 return result;
5270}
5271
5272PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005273PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274{
Victor Stinnerb960b342011-11-20 19:12:52 +01005275 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276}
5277
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278/* --- UTF-16 Codec ------------------------------------------------------- */
5279
Tim Peters772747b2001-08-09 22:21:55 +00005280PyObject *
5281PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 Py_ssize_t size,
5283 const char *errors,
5284 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
Walter Dörwald69652032004-09-07 20:24:22 +00005286 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5287}
5288
5289PyObject *
5290PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 Py_ssize_t size,
5292 const char *errors,
5293 int *byteorder,
5294 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t startinpos;
5298 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005300 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005301 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005302 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005303 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 PyObject *errorHandler = NULL;
5305 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005306 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Tim Peters772747b2001-08-09 22:21:55 +00005308 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005309 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
5311 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005312 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005314 /* Check for BOM marks (U+FEFF) in the input and adjust current
5315 byte order setting accordingly. In native mode, the leading BOM
5316 mark is skipped, in all other modes, it is copied to the output
5317 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 if (bo == 0 && size >= 2) {
5319 const Py_UCS4 bom = (q[1] << 8) | q[0];
5320 if (bom == 0xFEFF) {
5321 q += 2;
5322 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005324 else if (bom == 0xFFFE) {
5325 q += 2;
5326 bo = 1;
5327 }
5328 if (byteorder)
5329 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 if (q == e) {
5333 if (consumed)
5334 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005335 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005336 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005337
Christian Heimes743e0cd2012-10-17 23:52:17 +02005338#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005339 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005340 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005341#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005342 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005344#endif
Tim Peters772747b2001-08-09 22:21:55 +00005345
Antoine Pitrou63065d72012-05-15 23:48:04 +02005346 /* Note: size will always be longer than the resulting Unicode
5347 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005348 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005349 writer.min_length = (e - q + 1) / 2;
5350 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005351 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005352
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 while (1) {
5354 Py_UCS4 ch = 0;
5355 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005356 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005357 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005359 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005361 native_ordering);
5362 else
5363 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005365 native_ordering);
5366 } else if (kind == PyUnicode_2BYTE_KIND) {
5367 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005369 native_ordering);
5370 } else {
5371 assert(kind == PyUnicode_4BYTE_KIND);
5372 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005374 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005375 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377
Antoine Pitrou63065d72012-05-15 23:48:04 +02005378 switch (ch)
5379 {
5380 case 0:
5381 /* remaining byte at the end? (size should be even) */
5382 if (q == e || consumed)
5383 goto End;
5384 errmsg = "truncated data";
5385 startinpos = ((const char *)q) - starts;
5386 endinpos = ((const char *)e) - starts;
5387 break;
5388 /* The remaining input chars are ignored if the callback
5389 chooses to skip the input */
5390 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005391 q -= 2;
5392 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005393 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005394 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005395 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005396 endinpos = ((const char *)e) - starts;
5397 break;
5398 case 2:
5399 errmsg = "illegal encoding";
5400 startinpos = ((const char *)q) - 2 - starts;
5401 endinpos = startinpos + 2;
5402 break;
5403 case 3:
5404 errmsg = "illegal UTF-16 surrogate";
5405 startinpos = ((const char *)q) - 4 - starts;
5406 endinpos = startinpos + 2;
5407 break;
5408 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005409 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005410 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 continue;
5412 }
5413
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005414 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005415 errors,
5416 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005417 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005418 &starts,
5419 (const char **)&e,
5420 &startinpos,
5421 &endinpos,
5422 &exc,
5423 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005424 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 }
5427
Antoine Pitrou63065d72012-05-15 23:48:04 +02005428End:
Walter Dörwald69652032004-09-07 20:24:22 +00005429 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 Py_XDECREF(errorHandler);
5433 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005434 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005437 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 Py_XDECREF(errorHandler);
5439 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return NULL;
5441}
5442
Tim Peters772747b2001-08-09 22:21:55 +00005443PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444_PyUnicode_EncodeUTF16(PyObject *str,
5445 const char *errors,
5446 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005448 enum PyUnicode_Kind kind;
5449 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005450 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005451 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005452 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005453 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005454#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005455 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005456#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005457 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005458#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 const char *encoding;
5460 Py_ssize_t nsize, pos;
5461 PyObject *errorHandler = NULL;
5462 PyObject *exc = NULL;
5463 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 if (!PyUnicode_Check(str)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005469 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005470 return NULL;
5471 kind = PyUnicode_KIND(str);
5472 data = PyUnicode_DATA(str);
5473 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005474
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005476 if (kind == PyUnicode_4BYTE_KIND) {
5477 const Py_UCS4 *in = (const Py_UCS4 *)data;
5478 const Py_UCS4 *end = in + len;
5479 while (in < end)
5480 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005481 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005482 }
5483 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 nsize = len + pairs + (byteorder == 0);
5486 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 if (v == NULL)
5488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005490 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005491 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005492 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005494 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005495 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005496 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005497
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 if (kind == PyUnicode_1BYTE_KIND) {
5499 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5500 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005501 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005502
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 if (byteorder < 0)
5504 encoding = "utf-16-le";
5505 else if (byteorder > 0)
5506 encoding = "utf-16-be";
5507 else
5508 encoding = "utf-16";
5509
5510 pos = 0;
5511 while (pos < len) {
5512 Py_ssize_t repsize, moreunits;
5513
5514 if (kind == PyUnicode_2BYTE_KIND) {
5515 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5516 &out, native_ordering);
5517 }
5518 else {
5519 assert(kind == PyUnicode_4BYTE_KIND);
5520 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5521 &out, native_ordering);
5522 }
5523 if (pos == len)
5524 break;
5525
5526 rep = unicode_encode_call_errorhandler(
5527 errors, &errorHandler,
5528 encoding, "surrogates not allowed",
5529 str, &exc, pos, pos + 1, &pos);
5530 if (!rep)
5531 goto error;
5532
5533 if (PyBytes_Check(rep)) {
5534 repsize = PyBytes_GET_SIZE(rep);
5535 if (repsize & 1) {
5536 raise_encode_exception(&exc, encoding,
5537 str, pos - 1, pos,
5538 "surrogates not allowed");
5539 goto error;
5540 }
5541 moreunits = repsize / 2;
5542 }
5543 else {
5544 assert(PyUnicode_Check(rep));
5545 if (PyUnicode_READY(rep) < 0)
5546 goto error;
5547 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5548 if (!PyUnicode_IS_ASCII(rep)) {
5549 raise_encode_exception(&exc, encoding,
5550 str, pos - 1, pos,
5551 "surrogates not allowed");
5552 goto error;
5553 }
5554 }
5555
5556 /* two bytes are reserved for each surrogate */
5557 if (moreunits > 1) {
5558 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5559 Py_ssize_t morebytes = 2 * (moreunits - 1);
5560 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5561 /* integer overflow */
5562 PyErr_NoMemory();
5563 goto error;
5564 }
5565 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5566 goto error;
5567 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5568 }
5569
5570 if (PyBytes_Check(rep)) {
5571 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5572 out += moreunits;
5573 } else /* rep is unicode */ {
5574 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5575 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5576 &out, native_ordering);
5577 }
5578
5579 Py_CLEAR(rep);
5580 }
5581
5582 /* Cut back to size actually needed. This is necessary for, for example,
5583 encoding of a string containing isolated surrogates and the 'ignore' handler
5584 is used. */
5585 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5586 if (nsize != PyBytes_GET_SIZE(v))
5587 _PyBytes_Resize(&v, nsize);
5588 Py_XDECREF(errorHandler);
5589 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005590 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005591 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005592 error:
5593 Py_XDECREF(rep);
5594 Py_XDECREF(errorHandler);
5595 Py_XDECREF(exc);
5596 Py_XDECREF(v);
5597 return NULL;
5598#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599}
5600
Alexander Belopolsky40018472011-02-26 01:02:56 +00005601PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005602PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5603 Py_ssize_t size,
5604 const char *errors,
5605 int byteorder)
5606{
5607 PyObject *result;
5608 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5609 if (tmp == NULL)
5610 return NULL;
5611 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5612 Py_DECREF(tmp);
5613 return result;
5614}
5615
5616PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005617PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620}
5621
5622/* --- Unicode Escape Codec ----------------------------------------------- */
5623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5625 if all the escapes in the string make it still a valid ASCII string.
5626 Returns -1 if any escapes were found which cause the string to
5627 pop out of ASCII range. Otherwise returns the length of the
5628 required buffer to hold the string.
5629 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005630static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5632{
5633 const unsigned char *p = (const unsigned char *)s;
5634 const unsigned char *end = p + size;
5635 Py_ssize_t length = 0;
5636
5637 if (size < 0)
5638 return -1;
5639
5640 for (; p < end; ++p) {
5641 if (*p > 127) {
5642 /* Non-ASCII */
5643 return -1;
5644 }
5645 else if (*p != '\\') {
5646 /* Normal character */
5647 ++length;
5648 }
5649 else {
5650 /* Backslash-escape, check next char */
5651 ++p;
5652 /* Escape sequence reaches till end of string or
5653 non-ASCII follow-up. */
5654 if (p >= end || *p > 127)
5655 return -1;
5656 switch (*p) {
5657 case '\n':
5658 /* backslash + \n result in zero characters */
5659 break;
5660 case '\\': case '\'': case '\"':
5661 case 'b': case 'f': case 't':
5662 case 'n': case 'r': case 'v': case 'a':
5663 ++length;
5664 break;
5665 case '0': case '1': case '2': case '3':
5666 case '4': case '5': case '6': case '7':
5667 case 'x': case 'u': case 'U': case 'N':
5668 /* these do not guarantee ASCII characters */
5669 return -1;
5670 default:
5671 /* count the backslash + the other character */
5672 length += 2;
5673 }
5674 }
5675 }
5676 return length;
5677}
5678
Fredrik Lundh06d12682001-01-24 07:59:11 +00005679static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005680
Alexander Belopolsky40018472011-02-26 01:02:56 +00005681PyObject *
5682PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005683 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005684 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005687 Py_ssize_t startinpos;
5688 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 char* message;
5692 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 PyObject *errorHandler = NULL;
5694 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005696
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005697 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005698 if (len == 0)
5699 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005700
5701 /* After length_of_escaped_ascii_string() there are two alternatives,
5702 either the string is pure ASCII with named escapes like \n, etc.
5703 and we determined it's exact size (common case)
5704 or it contains \x, \u, ... escape sequences. then we create a
5705 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005706 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005708 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 }
5710 else {
5711 /* Escaped strings will always be longer than the resulting
5712 Unicode string, so we start with size here and then reduce the
5713 length after conversion to the true value.
5714 (but if the error callback returns a long replacement string
5715 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005716 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005717 }
5718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005720 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005722
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 while (s < end) {
5724 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005725 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
5728 /* Non-escape characters are interpreted as Unicode ordinals */
5729 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005730 x = (unsigned char)*s;
5731 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005732 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 continue;
5735 }
5736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 /* \ - Escapes */
5739 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005740 c = *s++;
5741 if (s > end)
5742 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005743
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005744 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005747#define WRITECHAR(ch) \
5748 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005749 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005751 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005752
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005754 case '\\': WRITECHAR('\\'); break;
5755 case '\'': WRITECHAR('\''); break;
5756 case '\"': WRITECHAR('\"'); break;
5757 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005758 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 case 'f': WRITECHAR('\014'); break;
5760 case 't': WRITECHAR('\t'); break;
5761 case 'n': WRITECHAR('\n'); break;
5762 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005763 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005764 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005765 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 case '0': case '1': case '2': case '3':
5770 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005771 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005772 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005773 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005774 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005775 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 break;
5779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* hex escapes */
5781 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005783 digits = 2;
5784 message = "truncated \\xXX escape";
5785 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005789 digits = 4;
5790 message = "truncated \\uXXXX escape";
5791 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005794 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005795 digits = 8;
5796 message = "truncated \\UXXXXXXXX escape";
5797 hexescape:
5798 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005799 if (end - s < digits) {
5800 /* count only hex digits */
5801 for (; s < end; ++s) {
5802 c = (unsigned char)*s;
5803 if (!Py_ISXDIGIT(c))
5804 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005805 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005806 goto error;
5807 }
5808 for (; digits--; ++s) {
5809 c = (unsigned char)*s;
5810 if (!Py_ISXDIGIT(c))
5811 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005812 chr = (chr<<4) & ~0xF;
5813 if (c >= '0' && c <= '9')
5814 chr += c - '0';
5815 else if (c >= 'a' && c <= 'f')
5816 chr += 10 + c - 'a';
5817 else
5818 chr += 10 + c - 'A';
5819 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005820 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 /* _decoding_error will have already written into the
5822 target buffer. */
5823 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005824 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005825 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005826 message = "illegal Unicode character";
5827 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005828 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005829 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 break;
5831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 case 'N':
5834 message = "malformed \\N character escape";
5835 if (ucnhash_CAPI == NULL) {
5836 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5838 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 if (ucnhash_CAPI == NULL)
5840 goto ucnhashError;
5841 }
5842 if (*s == '{') {
5843 const char *start = s+1;
5844 /* look for the closing brace */
5845 while (*s != '}' && s < end)
5846 s++;
5847 if (s > start && s < end && *s == '}') {
5848 /* found a name. look it up in the unicode database */
5849 message = "unknown Unicode character name";
5850 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005851 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005852 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005853 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005854 goto store;
5855 }
5856 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005857 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005858
5859 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005860 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 message = "\\ at end of string";
5862 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005863 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005864 }
5865 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005866 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005867 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005868 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005869 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005871 continue;
5872
5873 error:
5874 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005875 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005876 errors, &errorHandler,
5877 "unicodeescape", message,
5878 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005879 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005880 goto onError;
5881 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005883#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005884
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005885 Py_XDECREF(errorHandler);
5886 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005887 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005888
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005890 PyErr_SetString(
5891 PyExc_UnicodeError,
5892 "\\N escapes not supported (can't load unicodedata module)"
5893 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005897 return NULL;
5898
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 Py_XDECREF(errorHandler);
5902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 return NULL;
5904}
5905
5906/* Return a Unicode-Escape string version of the Unicode object.
5907
5908 If quotes is true, the string is enclosed in u"" or u'' quotes as
5909 appropriate.
5910
5911*/
5912
Alexander Belopolsky40018472011-02-26 01:02:56 +00005913PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005917 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 int kind;
5920 void *data;
5921 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922
Ezio Melottie7f90372012-10-05 03:33:31 +03005923 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005924 escape.
5925
Ezio Melottie7f90372012-10-05 03:33:31 +03005926 For UCS1 strings it's '\xxx', 4 bytes per source character.
5927 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5928 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005929 */
5930
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 if (!PyUnicode_Check(unicode)) {
5932 PyErr_BadArgument();
5933 return NULL;
5934 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005935 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936 return NULL;
5937 len = PyUnicode_GET_LENGTH(unicode);
5938 kind = PyUnicode_KIND(unicode);
5939 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005940 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5942 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5943 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5944 }
5945
5946 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 return PyBytes_FromStringAndSize(NULL, 0);
5948
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005951
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 if (repr == NULL)
5957 return NULL;
5958
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005959 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005962 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005963
Walter Dörwald79e913e2007-05-12 11:08:06 +00005964 /* Escape backslashes */
5965 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 *p++ = '\\';
5967 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005968 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005971 /* Map 21-bit characters to '\U00xxxxxx' */
5972 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005973 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005974 *p++ = '\\';
5975 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005976 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5982 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5983 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005985 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005988 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 *p++ = '\\';
5990 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005991 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5992 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5993 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5994 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005996
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005997 /* Map special whitespace to '\t', \n', '\r' */
5998 else if (ch == '\t') {
5999 *p++ = '\\';
6000 *p++ = 't';
6001 }
6002 else if (ch == '\n') {
6003 *p++ = '\\';
6004 *p++ = 'n';
6005 }
6006 else if (ch == '\r') {
6007 *p++ = '\\';
6008 *p++ = 'r';
6009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006010
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006011 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006012 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006014 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006015 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6016 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006017 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006018
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 /* Copy everything else as-is */
6020 else
6021 *p++ = (char) ch;
6022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 assert(p - PyBytes_AS_STRING(repr) > 0);
6025 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6026 return NULL;
6027 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
Alexander Belopolsky40018472011-02-26 01:02:56 +00006030PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6032 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 PyObject *result;
6035 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6036 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 result = PyUnicode_AsUnicodeEscapeString(tmp);
6039 Py_DECREF(tmp);
6040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041}
6042
6043/* --- Raw Unicode Escape Codec ------------------------------------------- */
6044
Alexander Belopolsky40018472011-02-26 01:02:56 +00006045PyObject *
6046PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006047 Py_ssize_t size,
6048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006051 Py_ssize_t startinpos;
6052 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 const char *end;
6055 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 PyObject *errorHandler = NULL;
6057 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006058
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006059 if (size == 0)
6060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 /* Escaped strings will always be longer than the resulting
6063 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 length after conversion to the true value. (But decoding error
6065 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006066 _PyUnicodeWriter_Init(&writer);
6067 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 end = s + size;
6070 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 unsigned char c;
6072 Py_UCS4 x;
6073 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006074 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 /* Non-escape characters are interpreted as Unicode ordinals */
6077 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006078 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006079 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006080 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 startinpos = s-starts;
6084
6085 /* \u-escapes are only interpreted iff the number of leading
6086 backslashes if odd */
6087 bs = s;
6088 for (;s < end;) {
6089 if (*s != '\\')
6090 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006091 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006092 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006093 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 }
6095 if (((s - bs) & 1) == 0 ||
6096 s >= end ||
6097 (*s != 'u' && *s != 'U')) {
6098 continue;
6099 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006100 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 count = *s=='u' ? 4 : 8;
6102 s++;
6103
6104 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 for (x = 0, i = 0; i < count; ++i, ++s) {
6106 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006107 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006109 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 errors, &errorHandler,
6111 "rawunicodeescape", "truncated \\uXXXX",
6112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006113 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 goto onError;
6115 goto nextByte;
6116 }
6117 x = (x<<4) & ~0xF;
6118 if (c >= '0' && c <= '9')
6119 x += c - '0';
6120 else if (c >= 'a' && c <= 'f')
6121 x += 10 + c - 'a';
6122 else
6123 x += 10 + c - 'A';
6124 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006125 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006126 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006127 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006128 }
6129 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006130 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006131 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006132 errors, &errorHandler,
6133 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006135 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 nextByte:
6139 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006143 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006144
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006146 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return NULL;
6150}
6151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006156 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 char *p;
6158 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 Py_ssize_t expandsize, pos;
6160 int kind;
6161 void *data;
6162 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 if (!PyUnicode_Check(unicode)) {
6165 PyErr_BadArgument();
6166 return NULL;
6167 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006168 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 return NULL;
6170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
6172 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006173 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6174 bytes, and 1 byte characters 4. */
6175 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006176
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (repr == NULL)
6182 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006184 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006187 for (pos = 0; pos < len; pos++) {
6188 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 /* Map 32-bit characters to '\Uxxxxxxxx' */
6190 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006191 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006192 *p++ = '\\';
6193 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006194 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6195 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6199 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6201 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006202 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006204 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 *p++ = '\\';
6206 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006207 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6208 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6210 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 /* Copy everything else as-is */
6213 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = (char) ch;
6215 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006216
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217 assert(p > q);
6218 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006219 return NULL;
6220 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Alexander Belopolsky40018472011-02-26 01:02:56 +00006223PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6225 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006227 PyObject *result;
6228 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6229 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006230 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6232 Py_DECREF(tmp);
6233 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234}
6235
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006236/* --- Unicode Internal Codec ------------------------------------------- */
6237
Alexander Belopolsky40018472011-02-26 01:02:56 +00006238PyObject *
6239_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006240 Py_ssize_t size,
6241 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006242{
6243 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006244 Py_ssize_t startinpos;
6245 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006246 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 const char *end;
6248 const char *reason;
6249 PyObject *errorHandler = NULL;
6250 PyObject *exc = NULL;
6251
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006253 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 1))
6255 return NULL;
6256
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006257 if (size == 0)
6258 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006259
Victor Stinner8f674cc2013-04-17 23:02:17 +02006260 _PyUnicodeWriter_Init(&writer);
6261 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6262 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006264 }
6265 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006266
Victor Stinner8f674cc2013-04-17 23:02:17 +02006267 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006269 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006270 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006271 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006272 endinpos = end-starts;
6273 reason = "truncated input";
6274 goto error;
6275 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006276 /* We copy the raw representation one byte at a time because the
6277 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006278 ((char *) &uch)[0] = s[0];
6279 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006280#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006281 ((char *) &uch)[2] = s[2];
6282 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006283#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006284 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006285#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 /* We have to sanity check the raw data, otherwise doom looms for
6287 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006288 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006289 endinpos = s - starts + Py_UNICODE_SIZE;
6290 reason = "illegal code point (> 0x10FFFF)";
6291 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006292 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006293#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294 s += Py_UNICODE_SIZE;
6295#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006296 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006297 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006298 Py_UNICODE uch2;
6299 ((char *) &uch2)[0] = s[0];
6300 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006301 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006302 {
Victor Stinner551ac952011-11-29 22:58:13 +01006303 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305 }
6306 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006307#endif
6308
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006309 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006310 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006311 continue;
6312
6313 error:
6314 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006315 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006316 errors, &errorHandler,
6317 "unicode_internal", reason,
6318 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006319 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006320 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006321 }
6322
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 Py_XDECREF(errorHandler);
6324 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006325 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006328 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 Py_XDECREF(errorHandler);
6330 Py_XDECREF(exc);
6331 return NULL;
6332}
6333
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334/* --- Latin-1 Codec ------------------------------------------------------ */
6335
Alexander Belopolsky40018472011-02-26 01:02:56 +00006336PyObject *
6337PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006338 Py_ssize_t size,
6339 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006342 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006346static void
6347make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006348 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006349 PyObject *unicode,
6350 Py_ssize_t startpos, Py_ssize_t endpos,
6351 const char *reason)
6352{
6353 if (*exceptionObject == NULL) {
6354 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006355 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006356 encoding, unicode, startpos, endpos, reason);
6357 }
6358 else {
6359 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6360 goto onError;
6361 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6362 goto onError;
6363 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6364 goto onError;
6365 return;
6366 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006367 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006368 }
6369}
6370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006372static void
6373raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006374 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006375 PyObject *unicode,
6376 Py_ssize_t startpos, Py_ssize_t endpos,
6377 const char *reason)
6378{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006379 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006380 encoding, unicode, startpos, endpos, reason);
6381 if (*exceptionObject != NULL)
6382 PyCodec_StrictErrors(*exceptionObject);
6383}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384
6385/* error handling callback helper:
6386 build arguments, call the callback and check the arguments,
6387 put the result into newpos and return the replacement string, which
6388 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006389static PyObject *
6390unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006391 PyObject **errorHandler,
6392 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006394 Py_ssize_t startpos, Py_ssize_t endpos,
6395 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006397 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 PyObject *restuple;
6400 PyObject *resunicode;
6401
6402 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 }
6407
Benjamin Petersonbac79492012-01-14 13:34:47 -05006408 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 return NULL;
6410 len = PyUnicode_GET_LENGTH(unicode);
6411
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006412 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006413 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416
6417 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006422 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 Py_DECREF(restuple);
6424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006426 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 &resunicode, newpos)) {
6428 Py_DECREF(restuple);
6429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006431 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6432 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6433 Py_DECREF(restuple);
6434 return NULL;
6435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006437 *newpos = len + *newpos;
6438 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6440 Py_DECREF(restuple);
6441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 Py_INCREF(resunicode);
6444 Py_DECREF(restuple);
6445 return resunicode;
6446}
6447
Alexander Belopolsky40018472011-02-26 01:02:56 +00006448static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006450 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006451 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453 /* input state */
6454 Py_ssize_t pos=0, size;
6455 int kind;
6456 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 /* output object */
6458 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 /* pointer into the output */
6460 char *str;
6461 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006463 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6464 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 PyObject *errorHandler = NULL;
6466 PyObject *exc = NULL;
6467 /* the following variable is used for caching string comparisons
6468 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6469 int known_errorHandler = -1;
6470
Benjamin Petersonbac79492012-01-14 13:34:47 -05006471 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 return NULL;
6473 size = PyUnicode_GET_LENGTH(unicode);
6474 kind = PyUnicode_KIND(unicode);
6475 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 /* allocate enough for a simple encoding without
6477 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006478 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006479 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006480 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006481 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006482 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006483 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 ressize = size;
6485
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 while (pos < size) {
6487 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 /* can we encode this? */
6490 if (c<limit) {
6491 /* no overflow check, because we know that the space is enough */
6492 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006494 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 Py_ssize_t requiredsize;
6497 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 Py_ssize_t collstart = pos;
6501 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 ++collend;
6505 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6506 if (known_errorHandler==-1) {
6507 if ((errors==NULL) || (!strcmp(errors, "strict")))
6508 known_errorHandler = 1;
6509 else if (!strcmp(errors, "replace"))
6510 known_errorHandler = 2;
6511 else if (!strcmp(errors, "ignore"))
6512 known_errorHandler = 3;
6513 else if (!strcmp(errors, "xmlcharrefreplace"))
6514 known_errorHandler = 4;
6515 else
6516 known_errorHandler = 0;
6517 }
6518 switch (known_errorHandler) {
6519 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006520 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 goto onError;
6522 case 2: /* replace */
6523 while (collstart++<collend)
6524 *str++ = '?'; /* fall through */
6525 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 break;
6528 case 4: /* xmlcharrefreplace */
6529 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 /* determine replacement size */
6531 for (i = collstart, repsize = 0; i < collend; ++i) {
6532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6533 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006545 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006546 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006550 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 if (requiredsize > ressize) {
6552 if (requiredsize<2*ressize)
6553 requiredsize = 2*ressize;
6554 if (_PyBytes_Resize(&res, requiredsize))
6555 goto onError;
6556 str = PyBytes_AS_STRING(res) + respos;
6557 ressize = requiredsize;
6558 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 /* generate replacement */
6560 for (i = collstart; i < collend; ++i) {
6561 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006563 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 break;
6565 default:
6566 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006567 encoding, reason, unicode, &exc,
6568 collstart, collend, &newpos);
6569 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006570 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006572 if (PyBytes_Check(repunicode)) {
6573 /* Directly copy bytes result to output. */
6574 repsize = PyBytes_Size(repunicode);
6575 if (repsize > 1) {
6576 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006577 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006578 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6579 Py_DECREF(repunicode);
6580 goto onError;
6581 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006582 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006583 ressize += repsize-1;
6584 }
6585 memcpy(str, PyBytes_AsString(repunicode), repsize);
6586 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006588 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006589 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* need more space? (at least enough for what we
6592 have+the replacement+the rest of the string, so
6593 we won't have to check space for encodable characters) */
6594 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595 repsize = PyUnicode_GET_LENGTH(repunicode);
6596 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 if (requiredsize > ressize) {
6598 if (requiredsize<2*ressize)
6599 requiredsize = 2*ressize;
6600 if (_PyBytes_Resize(&res, requiredsize)) {
6601 Py_DECREF(repunicode);
6602 goto onError;
6603 }
6604 str = PyBytes_AS_STRING(res) + respos;
6605 ressize = requiredsize;
6606 }
6607 /* check if there is anything unencodable in the replacement
6608 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006609 for (i = 0; repsize-->0; ++i, ++str) {
6610 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006612 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 Py_DECREF(repunicode);
6615 goto onError;
6616 }
6617 *str = (char)c;
6618 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006622 }
6623 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006624 /* Resize if we allocated to much */
6625 size = str - PyBytes_AS_STRING(res);
6626 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006627 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006628 if (_PyBytes_Resize(&res, size) < 0)
6629 goto onError;
6630 }
6631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 Py_XDECREF(errorHandler);
6633 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006634 return res;
6635
6636 onError:
6637 Py_XDECREF(res);
6638 Py_XDECREF(errorHandler);
6639 Py_XDECREF(exc);
6640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641}
6642
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644PyObject *
6645PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 Py_ssize_t size,
6647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 PyObject *result;
6650 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6651 if (unicode == NULL)
6652 return NULL;
6653 result = unicode_encode_ucs1(unicode, errors, 256);
6654 Py_DECREF(unicode);
6655 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656}
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
6661 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 PyErr_BadArgument();
6663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665 if (PyUnicode_READY(unicode) == -1)
6666 return NULL;
6667 /* Fast path: if it is a one-byte string, construct
6668 bytes object directly. */
6669 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6670 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6671 PyUnicode_GET_LENGTH(unicode));
6672 /* Non-Latin-1 characters present. Defer to above function to
6673 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675}
6676
6677PyObject*
6678PyUnicode_AsLatin1String(PyObject *unicode)
6679{
6680 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
6683/* --- 7-bit ASCII Codec -------------------------------------------------- */
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685PyObject *
6686PyUnicode_DecodeASCII(const char *s,
6687 Py_ssize_t size,
6688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006691 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006692 int kind;
6693 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t startinpos;
6695 Py_ssize_t endinpos;
6696 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 const char *e;
6698 PyObject *errorHandler = NULL;
6699 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006702 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006705 if (size == 1 && (unsigned char)s[0] < 128)
6706 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006707
Victor Stinner8f674cc2013-04-17 23:02:17 +02006708 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006709 writer.min_length = size;
6710 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006711 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006714 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006715 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 writer.pos = outpos;
6717 if (writer.pos == size)
6718 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006719
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006720 s += writer.pos;
6721 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006723 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006725 PyUnicode_WRITE(kind, data, writer.pos, c);
6726 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 ++s;
6728 }
6729 else {
6730 startinpos = s-starts;
6731 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006732 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 errors, &errorHandler,
6734 "ascii", "ordinal not in range(128)",
6735 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006736 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 kind = writer.kind;
6739 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 Py_XDECREF(errorHandler);
6743 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006744 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006745
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006747 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 Py_XDECREF(errorHandler);
6749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
6751}
6752
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006754PyObject *
6755PyUnicode_EncodeASCII(const Py_UNICODE *p,
6756 Py_ssize_t size,
6757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 PyObject *result;
6760 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6761 if (unicode == NULL)
6762 return NULL;
6763 result = unicode_encode_ucs1(unicode, errors, 128);
6764 Py_DECREF(unicode);
6765 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766}
6767
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006769_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 PyErr_BadArgument();
6773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 if (PyUnicode_READY(unicode) == -1)
6776 return NULL;
6777 /* Fast path: if it is an ASCII-only string, construct bytes object
6778 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006779 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6781 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783}
6784
6785PyObject *
6786PyUnicode_AsASCIIString(PyObject *unicode)
6787{
6788 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Victor Stinner99b95382011-07-04 14:23:54 +02006791#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006792
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006793/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
/* NEED_RETRY is defined when int is narrower than size_t.
   NOTE(review): presumably consumed by the code page codecs further down,
   whose size arguments are ints -- confirm at the use sites. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback value for headers that do not provide the flag themselves. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6802
6803static char*
6804code_page_name(UINT code_page, PyObject **obj)
6805{
6806 *obj = NULL;
6807 if (code_page == CP_ACP)
6808 return "mbcs";
6809 if (code_page == CP_UTF7)
6810 return "CP_UTF7";
6811 if (code_page == CP_UTF8)
6812 return "CP_UTF8";
6813
6814 *obj = PyBytes_FromFormat("cp%u", code_page);
6815 if (*obj == NULL)
6816 return NULL;
6817 return PyBytes_AS_STRING(*obj);
6818}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
Victor Stinner3a50e702011-10-18 21:21:00 +02006820static DWORD
6821decode_code_page_flags(UINT code_page)
6822{
6823 if (code_page == CP_UTF7) {
6824 /* The CP_UTF7 decoder only supports flags=0 */
6825 return 0;
6826 }
6827 else
6828 return MB_ERR_INVALID_CHARS;
6829}
6830
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006832 * Decode a byte string from a Windows code page into unicode object in strict
6833 * mode.
6834 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006835 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6836 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006838static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006839decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006840 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006841 const char *in,
6842 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843{
Victor Stinner3a50e702011-10-18 21:21:00 +02006844 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006845 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847
6848 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006849 assert(insize > 0);
6850 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6851 if (outsize <= 0)
6852 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
6854 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006856 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006857 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 if (*v == NULL)
6859 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006860 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861 }
6862 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006865 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 }
6869
6870 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6872 if (outsize <= 0)
6873 goto error;
6874 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006875
Victor Stinner3a50e702011-10-18 21:21:00 +02006876error:
6877 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6878 return -2;
6879 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006880 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881}
6882
Victor Stinner3a50e702011-10-18 21:21:00 +02006883/*
6884 * Decode a byte string from a code page into unicode object with an error
6885 * handler.
6886 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006887 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 * UnicodeDecodeError exception and returns -1 on error.
6889 */
6890static int
6891decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 PyObject **v,
6893 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006894 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006895{
6896 const char *startin = in;
6897 const char *endin = in + size;
6898 const DWORD flags = decode_code_page_flags(code_page);
6899 /* Ideally, we should get reason from FormatMessage. This is the Windows
6900 2000 English version of the message. */
6901 const char *reason = "No mapping for the Unicode character exists "
6902 "in the target code page.";
6903 /* each step cannot decode more than 1 character, but a character can be
6904 represented as a surrogate pair */
6905 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006906 int insize;
6907 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 PyObject *errorHandler = NULL;
6909 PyObject *exc = NULL;
6910 PyObject *encoding_obj = NULL;
6911 char *encoding;
6912 DWORD err;
6913 int ret = -1;
6914
6915 assert(size > 0);
6916
6917 encoding = code_page_name(code_page, &encoding_obj);
6918 if (encoding == NULL)
6919 return -1;
6920
Victor Stinner7d00cc12014-03-17 23:08:06 +01006921 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006922 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6923 UnicodeDecodeError. */
6924 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6925 if (exc != NULL) {
6926 PyCodec_StrictErrors(exc);
6927 Py_CLEAR(exc);
6928 }
6929 goto error;
6930 }
6931
6932 if (*v == NULL) {
6933 /* Create unicode object */
6934 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6935 PyErr_NoMemory();
6936 goto error;
6937 }
Victor Stinnerab595942011-12-17 04:59:06 +01006938 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006939 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 if (*v == NULL)
6941 goto error;
6942 startout = PyUnicode_AS_UNICODE(*v);
6943 }
6944 else {
6945 /* Extend unicode object */
6946 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6947 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6948 PyErr_NoMemory();
6949 goto error;
6950 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006951 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006952 goto error;
6953 startout = PyUnicode_AS_UNICODE(*v) + n;
6954 }
6955
6956 /* Decode the byte string character per character */
6957 out = startout;
6958 while (in < endin)
6959 {
6960 /* Decode a character */
6961 insize = 1;
6962 do
6963 {
6964 outsize = MultiByteToWideChar(code_page, flags,
6965 in, insize,
6966 buffer, Py_ARRAY_LENGTH(buffer));
6967 if (outsize > 0)
6968 break;
6969 err = GetLastError();
6970 if (err != ERROR_NO_UNICODE_TRANSLATION
6971 && err != ERROR_INSUFFICIENT_BUFFER)
6972 {
6973 PyErr_SetFromWindowsErr(0);
6974 goto error;
6975 }
6976 insize++;
6977 }
6978 /* 4=maximum length of a UTF-8 sequence */
6979 while (insize <= 4 && (in + insize) <= endin);
6980
6981 if (outsize <= 0) {
6982 Py_ssize_t startinpos, endinpos, outpos;
6983
Victor Stinner7d00cc12014-03-17 23:08:06 +01006984 /* last character in partial decode? */
6985 if (in + insize >= endin && !final)
6986 break;
6987
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 startinpos = in - startin;
6989 endinpos = startinpos + 1;
6990 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006991 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 errors, &errorHandler,
6993 encoding, reason,
6994 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006995 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 {
6997 goto error;
6998 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006999 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 }
7001 else {
7002 in += insize;
7003 memcpy(out, buffer, outsize * sizeof(wchar_t));
7004 out += outsize;
7005 }
7006 }
7007
7008 /* write a NUL character at the end */
7009 *out = 0;
7010
7011 /* Extend unicode object */
7012 outsize = out - startout;
7013 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007014 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 goto error;
Victor Stinner7d00cc12014-03-17 23:08:06 +01007016 ret = in - startin;
Victor Stinner3a50e702011-10-18 21:21:00 +02007017
7018error:
7019 Py_XDECREF(encoding_obj);
7020 Py_XDECREF(errorHandler);
7021 Py_XDECREF(exc);
7022 return ret;
7023}
7024
Victor Stinner3a50e702011-10-18 21:21:00 +02007025static PyObject *
7026decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007027 const char *s, Py_ssize_t size,
7028 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029{
Victor Stinner76a31a62011-11-04 00:05:13 +01007030 PyObject *v = NULL;
7031 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
Victor Stinner3a50e702011-10-18 21:21:00 +02007033 if (code_page < 0) {
7034 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7035 return NULL;
7036 }
7037
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040
Victor Stinner76a31a62011-11-04 00:05:13 +01007041 do
7042 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007043#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007044 if (size > INT_MAX) {
7045 chunk_size = INT_MAX;
7046 final = 0;
7047 done = 0;
7048 }
7049 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007051 {
7052 chunk_size = (int)size;
7053 final = (consumed == NULL);
7054 done = 1;
7055 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056
Victor Stinner76a31a62011-11-04 00:05:13 +01007057 if (chunk_size == 0 && done) {
7058 if (v != NULL)
7059 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007061 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062
Victor Stinner76a31a62011-11-04 00:05:13 +01007063 converted = decode_code_page_strict(code_page, &v,
7064 s, chunk_size);
7065 if (converted == -2)
7066 converted = decode_code_page_errors(code_page, &v,
7067 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007068 errors, final);
7069 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007070
7071 if (converted < 0) {
7072 Py_XDECREF(v);
7073 return NULL;
7074 }
7075
7076 if (consumed)
7077 *consumed += converted;
7078
7079 s += converted;
7080 size -= converted;
7081 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007083 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084}
7085
Alexander Belopolsky40018472011-02-26 01:02:56 +00007086PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007087PyUnicode_DecodeCodePageStateful(int code_page,
7088 const char *s,
7089 Py_ssize_t size,
7090 const char *errors,
7091 Py_ssize_t *consumed)
7092{
7093 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7094}
7095
7096PyObject *
7097PyUnicode_DecodeMBCSStateful(const char *s,
7098 Py_ssize_t size,
7099 const char *errors,
7100 Py_ssize_t *consumed)
7101{
7102 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7103}
7104
7105PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007106PyUnicode_DecodeMBCS(const char *s,
7107 Py_ssize_t size,
7108 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007109{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7111}
7112
Victor Stinner3a50e702011-10-18 21:21:00 +02007113static DWORD
7114encode_code_page_flags(UINT code_page, const char *errors)
7115{
7116 if (code_page == CP_UTF8) {
7117 if (winver.dwMajorVersion >= 6)
7118 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7119 and later */
7120 return WC_ERR_INVALID_CHARS;
7121 else
7122 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7123 return 0;
7124 }
7125 else if (code_page == CP_UTF7) {
7126 /* CP_UTF7 only supports flags=0 */
7127 return 0;
7128 }
7129 else {
7130 if (errors != NULL && strcmp(errors, "replace") == 0)
7131 return 0;
7132 else
7133 return WC_NO_BEST_FIT_CHARS;
7134 }
7135}
7136
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 * Encode a Unicode string to a Windows code page into a byte string in strict
7139 * mode.
7140 *
7141 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007142 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007144static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007145encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007146 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148{
Victor Stinner554f3f02010-06-16 23:33:54 +00007149 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 BOOL *pusedDefaultChar = &usedDefaultChar;
7151 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007152 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007153 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007154 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 const DWORD flags = encode_code_page_flags(code_page, NULL);
7156 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007157 /* Create a substring so that we can get the UTF-16 representation
7158 of just the slice under consideration. */
7159 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160
Martin v. Löwis3d325192011-11-04 18:23:06 +01007161 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007162
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007164 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007166 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007167
Victor Stinner2fc507f2011-11-04 20:06:39 +01007168 substring = PyUnicode_Substring(unicode, offset, offset+len);
7169 if (substring == NULL)
7170 return -1;
7171 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7172 if (p == NULL) {
7173 Py_DECREF(substring);
7174 return -1;
7175 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007176 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007177
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007178 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007180 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 NULL, 0,
7182 NULL, pusedDefaultChar);
7183 if (outsize <= 0)
7184 goto error;
7185 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007186 if (pusedDefaultChar && *pusedDefaultChar) {
7187 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007189 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007190
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194 if (*outbytes == NULL) {
7195 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007197 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199 }
7200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 const Py_ssize_t n = PyBytes_Size(*outbytes);
7203 if (outsize > PY_SSIZE_T_MAX - n) {
7204 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007208 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7209 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007211 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213 }
7214
7215 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007217 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 out, outsize,
7219 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 if (outsize <= 0)
7222 goto error;
7223 if (pusedDefaultChar && *pusedDefaultChar)
7224 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007225 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007226
Victor Stinner3a50e702011-10-18 21:21:00 +02007227error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007228 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7230 return -2;
7231 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007232 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007233}
7234
Victor Stinner3a50e702011-10-18 21:21:00 +02007235/*
7236 * Encode a Unicode string to a Windows code page into a byte string using a
7237 * error handler.
7238 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007239 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 * -1 on other error.
7241 */
7242static int
7243encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007244 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007245 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007246{
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007248 Py_ssize_t pos = unicode_offset;
7249 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 /* Ideally, we should get reason from FormatMessage. This is the Windows
7251 2000 English version of the message. */
7252 const char *reason = "invalid character";
7253 /* 4=maximum length of a UTF-8 sequence */
7254 char buffer[4];
7255 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7256 Py_ssize_t outsize;
7257 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 PyObject *errorHandler = NULL;
7259 PyObject *exc = NULL;
7260 PyObject *encoding_obj = NULL;
7261 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007262 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 PyObject *rep;
7264 int ret = -1;
7265
7266 assert(insize > 0);
7267
7268 encoding = code_page_name(code_page, &encoding_obj);
7269 if (encoding == NULL)
7270 return -1;
7271
7272 if (errors == NULL || strcmp(errors, "strict") == 0) {
7273 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7274 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007275 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 if (exc != NULL) {
7277 PyCodec_StrictErrors(exc);
7278 Py_DECREF(exc);
7279 }
7280 Py_XDECREF(encoding_obj);
7281 return -1;
7282 }
7283
7284 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7285 pusedDefaultChar = &usedDefaultChar;
7286 else
7287 pusedDefaultChar = NULL;
7288
7289 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7290 PyErr_NoMemory();
7291 goto error;
7292 }
7293 outsize = insize * Py_ARRAY_LENGTH(buffer);
7294
7295 if (*outbytes == NULL) {
7296 /* Create string object */
7297 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7298 if (*outbytes == NULL)
7299 goto error;
7300 out = PyBytes_AS_STRING(*outbytes);
7301 }
7302 else {
7303 /* Extend string object */
7304 Py_ssize_t n = PyBytes_Size(*outbytes);
7305 if (n > PY_SSIZE_T_MAX - outsize) {
7306 PyErr_NoMemory();
7307 goto error;
7308 }
7309 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7310 goto error;
7311 out = PyBytes_AS_STRING(*outbytes) + n;
7312 }
7313
7314 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007315 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007317 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7318 wchar_t chars[2];
7319 int charsize;
7320 if (ch < 0x10000) {
7321 chars[0] = (wchar_t)ch;
7322 charsize = 1;
7323 }
7324 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007325 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7326 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007327 charsize = 2;
7328 }
7329
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007331 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 buffer, Py_ARRAY_LENGTH(buffer),
7333 NULL, pusedDefaultChar);
7334 if (outsize > 0) {
7335 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7336 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007337 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 memcpy(out, buffer, outsize);
7339 out += outsize;
7340 continue;
7341 }
7342 }
7343 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7344 PyErr_SetFromWindowsErr(0);
7345 goto error;
7346 }
7347
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 rep = unicode_encode_call_errorhandler(
7349 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007350 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007351 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 if (rep == NULL)
7353 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007355
7356 if (PyBytes_Check(rep)) {
7357 outsize = PyBytes_GET_SIZE(rep);
7358 if (outsize != 1) {
7359 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7360 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7361 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7362 Py_DECREF(rep);
7363 goto error;
7364 }
7365 out = PyBytes_AS_STRING(*outbytes) + offset;
7366 }
7367 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7368 out += outsize;
7369 }
7370 else {
7371 Py_ssize_t i;
7372 enum PyUnicode_Kind kind;
7373 void *data;
7374
Benjamin Petersonbac79492012-01-14 13:34:47 -05007375 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 Py_DECREF(rep);
7377 goto error;
7378 }
7379
7380 outsize = PyUnicode_GET_LENGTH(rep);
7381 if (outsize != 1) {
7382 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7383 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7384 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7385 Py_DECREF(rep);
7386 goto error;
7387 }
7388 out = PyBytes_AS_STRING(*outbytes) + offset;
7389 }
7390 kind = PyUnicode_KIND(rep);
7391 data = PyUnicode_DATA(rep);
7392 for (i=0; i < outsize; i++) {
7393 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7394 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007395 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007396 encoding, unicode,
7397 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 "unable to encode error handler result to ASCII");
7399 Py_DECREF(rep);
7400 goto error;
7401 }
7402 *out = (unsigned char)ch;
7403 out++;
7404 }
7405 }
7406 Py_DECREF(rep);
7407 }
7408 /* write a NUL byte */
7409 *out = 0;
7410 outsize = out - PyBytes_AS_STRING(*outbytes);
7411 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7412 if (_PyBytes_Resize(outbytes, outsize) < 0)
7413 goto error;
7414 ret = 0;
7415
7416error:
7417 Py_XDECREF(encoding_obj);
7418 Py_XDECREF(errorHandler);
7419 Py_XDECREF(exc);
7420 return ret;
7421}
7422
Victor Stinner3a50e702011-10-18 21:21:00 +02007423static PyObject *
7424encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007425 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 const char *errors)
7427{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007428 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007430 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007431 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007432
Benjamin Petersonbac79492012-01-14 13:34:47 -05007433 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007434 return NULL;
7435 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007436
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 if (code_page < 0) {
7438 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7439 return NULL;
7440 }
7441
Martin v. Löwis3d325192011-11-04 18:23:06 +01007442 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 return PyBytes_FromStringAndSize(NULL, 0);
7444
Victor Stinner7581cef2011-11-03 22:32:33 +01007445 offset = 0;
7446 do
7447 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007449 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450 chunks. */
7451 if (len > INT_MAX/2) {
7452 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007453 done = 0;
7454 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007455 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007457 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007458 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007459 done = 1;
7460 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007463 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007464 errors);
7465 if (ret == -2)
7466 ret = encode_code_page_errors(code_page, &outbytes,
7467 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007468 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007469 if (ret < 0) {
7470 Py_XDECREF(outbytes);
7471 return NULL;
7472 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473
Victor Stinner7581cef2011-11-03 22:32:33 +01007474 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 return outbytes;
7479}
7480
7481PyObject *
7482PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7483 Py_ssize_t size,
7484 const char *errors)
7485{
Victor Stinner7581cef2011-11-03 22:32:33 +01007486 PyObject *unicode, *res;
7487 unicode = PyUnicode_FromUnicode(p, size);
7488 if (unicode == NULL)
7489 return NULL;
7490 res = encode_code_page(CP_ACP, unicode, errors);
7491 Py_DECREF(unicode);
7492 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007493}
7494
7495PyObject *
7496PyUnicode_EncodeCodePage(int code_page,
7497 PyObject *unicode,
7498 const char *errors)
7499{
Victor Stinner7581cef2011-11-03 22:32:33 +01007500 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007501}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007502
Alexander Belopolsky40018472011-02-26 01:02:56 +00007503PyObject *
7504PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007505{
7506 if (!PyUnicode_Check(unicode)) {
7507 PyErr_BadArgument();
7508 return NULL;
7509 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007510 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007511}
7512
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007513#undef NEED_RETRY
7514
Victor Stinner99b95382011-07-04 14:23:54 +02007515#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007516
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517/* --- Character Mapping Codec -------------------------------------------- */
7518
Victor Stinnerfb161b12013-04-18 01:44:27 +02007519static int
7520charmap_decode_string(const char *s,
7521 Py_ssize_t size,
7522 PyObject *mapping,
7523 const char *errors,
7524 _PyUnicodeWriter *writer)
7525{
7526 const char *starts = s;
7527 const char *e;
7528 Py_ssize_t startinpos, endinpos;
7529 PyObject *errorHandler = NULL, *exc = NULL;
7530 Py_ssize_t maplen;
7531 enum PyUnicode_Kind mapkind;
7532 void *mapdata;
7533 Py_UCS4 x;
7534 unsigned char ch;
7535
7536 if (PyUnicode_READY(mapping) == -1)
7537 return -1;
7538
7539 maplen = PyUnicode_GET_LENGTH(mapping);
7540 mapdata = PyUnicode_DATA(mapping);
7541 mapkind = PyUnicode_KIND(mapping);
7542
7543 e = s + size;
7544
7545 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7546 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7547 * is disabled in encoding aliases, latin1 is preferred because
7548 * its implementation is faster. */
7549 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7550 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7551 Py_UCS4 maxchar = writer->maxchar;
7552
7553 assert (writer->kind == PyUnicode_1BYTE_KIND);
7554 while (s < e) {
7555 ch = *s;
7556 x = mapdata_ucs1[ch];
7557 if (x > maxchar) {
7558 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7559 goto onError;
7560 maxchar = writer->maxchar;
7561 outdata = (Py_UCS1 *)writer->data;
7562 }
7563 outdata[writer->pos] = x;
7564 writer->pos++;
7565 ++s;
7566 }
7567 return 0;
7568 }
7569
7570 while (s < e) {
7571 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7572 enum PyUnicode_Kind outkind = writer->kind;
7573 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7574 if (outkind == PyUnicode_1BYTE_KIND) {
7575 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7576 Py_UCS4 maxchar = writer->maxchar;
7577 while (s < e) {
7578 ch = *s;
7579 x = mapdata_ucs2[ch];
7580 if (x > maxchar)
7581 goto Error;
7582 outdata[writer->pos] = x;
7583 writer->pos++;
7584 ++s;
7585 }
7586 break;
7587 }
7588 else if (outkind == PyUnicode_2BYTE_KIND) {
7589 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7590 while (s < e) {
7591 ch = *s;
7592 x = mapdata_ucs2[ch];
7593 if (x == 0xFFFE)
7594 goto Error;
7595 outdata[writer->pos] = x;
7596 writer->pos++;
7597 ++s;
7598 }
7599 break;
7600 }
7601 }
7602 ch = *s;
7603
7604 if (ch < maplen)
7605 x = PyUnicode_READ(mapkind, mapdata, ch);
7606 else
7607 x = 0xfffe; /* invalid value */
7608Error:
7609 if (x == 0xfffe)
7610 {
7611 /* undefined mapping */
7612 startinpos = s-starts;
7613 endinpos = startinpos+1;
7614 if (unicode_decode_call_errorhandler_writer(
7615 errors, &errorHandler,
7616 "charmap", "character maps to <undefined>",
7617 &starts, &e, &startinpos, &endinpos, &exc, &s,
7618 writer)) {
7619 goto onError;
7620 }
7621 continue;
7622 }
7623
7624 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7625 goto onError;
7626 ++s;
7627 }
7628 Py_XDECREF(errorHandler);
7629 Py_XDECREF(exc);
7630 return 0;
7631
7632onError:
7633 Py_XDECREF(errorHandler);
7634 Py_XDECREF(exc);
7635 return -1;
7636}
7637
7638static int
7639charmap_decode_mapping(const char *s,
7640 Py_ssize_t size,
7641 PyObject *mapping,
7642 const char *errors,
7643 _PyUnicodeWriter *writer)
7644{
7645 const char *starts = s;
7646 const char *e;
7647 Py_ssize_t startinpos, endinpos;
7648 PyObject *errorHandler = NULL, *exc = NULL;
7649 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007650 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007651
7652 e = s + size;
7653
7654 while (s < e) {
7655 ch = *s;
7656
7657 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7658 key = PyLong_FromLong((long)ch);
7659 if (key == NULL)
7660 goto onError;
7661
7662 item = PyObject_GetItem(mapping, key);
7663 Py_DECREF(key);
7664 if (item == NULL) {
7665 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7666 /* No mapping found means: mapping is undefined. */
7667 PyErr_Clear();
7668 goto Undefined;
7669 } else
7670 goto onError;
7671 }
7672
7673 /* Apply mapping */
7674 if (item == Py_None)
7675 goto Undefined;
7676 if (PyLong_Check(item)) {
7677 long value = PyLong_AS_LONG(item);
7678 if (value == 0xFFFE)
7679 goto Undefined;
7680 if (value < 0 || value > MAX_UNICODE) {
7681 PyErr_Format(PyExc_TypeError,
7682 "character mapping must be in range(0x%lx)",
7683 (unsigned long)MAX_UNICODE + 1);
7684 goto onError;
7685 }
7686
7687 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7688 goto onError;
7689 }
7690 else if (PyUnicode_Check(item)) {
7691 if (PyUnicode_READY(item) == -1)
7692 goto onError;
7693 if (PyUnicode_GET_LENGTH(item) == 1) {
7694 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7695 if (value == 0xFFFE)
7696 goto Undefined;
7697 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7698 goto onError;
7699 }
7700 else {
7701 writer->overallocate = 1;
7702 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7703 goto onError;
7704 }
7705 }
7706 else {
7707 /* wrong return value */
7708 PyErr_SetString(PyExc_TypeError,
7709 "character mapping must return integer, None or str");
7710 goto onError;
7711 }
7712 Py_CLEAR(item);
7713 ++s;
7714 continue;
7715
7716Undefined:
7717 /* undefined mapping */
7718 Py_CLEAR(item);
7719 startinpos = s-starts;
7720 endinpos = startinpos+1;
7721 if (unicode_decode_call_errorhandler_writer(
7722 errors, &errorHandler,
7723 "charmap", "character maps to <undefined>",
7724 &starts, &e, &startinpos, &endinpos, &exc, &s,
7725 writer)) {
7726 goto onError;
7727 }
7728 }
7729 Py_XDECREF(errorHandler);
7730 Py_XDECREF(exc);
7731 return 0;
7732
7733onError:
7734 Py_XDECREF(item);
7735 Py_XDECREF(errorHandler);
7736 Py_XDECREF(exc);
7737 return -1;
7738}
7739
Alexander Belopolsky40018472011-02-26 01:02:56 +00007740PyObject *
7741PyUnicode_DecodeCharmap(const char *s,
7742 Py_ssize_t size,
7743 PyObject *mapping,
7744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007746 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007747
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 /* Default to Latin-1 */
7749 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007753 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007754 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007755 writer.min_length = size;
7756 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007758
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007759 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007760 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7761 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007762 }
7763 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007764 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7765 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007767 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007768
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007770 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 return NULL;
7772}
7773
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774/* Charmap encoding: the lookup table */
7775
Alexander Belopolsky40018472011-02-26 01:02:56 +00007776struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 PyObject_HEAD
7778 unsigned char level1[32];
7779 int count2, count3;
7780 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007781};
7782
7783static PyObject*
7784encoding_map_size(PyObject *obj, PyObject* args)
7785{
7786 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007789}
7790
7791static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 PyDoc_STR("Return the size (in bytes) of this object") },
7794 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007795};
7796
7797static void
7798encoding_map_dealloc(PyObject* o)
7799{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007800 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007801}
7802
7803static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007804 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 "EncodingMap", /*tp_name*/
7806 sizeof(struct encoding_map), /*tp_basicsize*/
7807 0, /*tp_itemsize*/
7808 /* methods */
7809 encoding_map_dealloc, /*tp_dealloc*/
7810 0, /*tp_print*/
7811 0, /*tp_getattr*/
7812 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007813 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 0, /*tp_repr*/
7815 0, /*tp_as_number*/
7816 0, /*tp_as_sequence*/
7817 0, /*tp_as_mapping*/
7818 0, /*tp_hash*/
7819 0, /*tp_call*/
7820 0, /*tp_str*/
7821 0, /*tp_getattro*/
7822 0, /*tp_setattro*/
7823 0, /*tp_as_buffer*/
7824 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7825 0, /*tp_doc*/
7826 0, /*tp_traverse*/
7827 0, /*tp_clear*/
7828 0, /*tp_richcompare*/
7829 0, /*tp_weaklistoffset*/
7830 0, /*tp_iter*/
7831 0, /*tp_iternext*/
7832 encoding_map_methods, /*tp_methods*/
7833 0, /*tp_members*/
7834 0, /*tp_getset*/
7835 0, /*tp_base*/
7836 0, /*tp_dict*/
7837 0, /*tp_descr_get*/
7838 0, /*tp_descr_set*/
7839 0, /*tp_dictoffset*/
7840 0, /*tp_init*/
7841 0, /*tp_alloc*/
7842 0, /*tp_new*/
7843 0, /*tp_free*/
7844 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845};
7846
7847PyObject*
7848PyUnicode_BuildEncodingMap(PyObject* string)
7849{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 PyObject *result;
7851 struct encoding_map *mresult;
7852 int i;
7853 int need_dict = 0;
7854 unsigned char level1[32];
7855 unsigned char level2[512];
7856 unsigned char *mlevel1, *mlevel2, *mlevel3;
7857 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007858 int kind;
7859 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007860 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007863 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007864 PyErr_BadArgument();
7865 return NULL;
7866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007867 kind = PyUnicode_KIND(string);
7868 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007869 length = PyUnicode_GET_LENGTH(string);
7870 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871 memset(level1, 0xFF, sizeof level1);
7872 memset(level2, 0xFF, sizeof level2);
7873
7874 /* If there isn't a one-to-one mapping of NULL to \0,
7875 or if there are non-BMP characters, we need to use
7876 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007879 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 ch = PyUnicode_READ(kind, data, i);
7882 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 need_dict = 1;
7884 break;
7885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007886 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 /* unmapped character */
7888 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 l1 = ch >> 11;
7890 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 if (level1[l1] == 0xFF)
7892 level1[l1] = count2++;
7893 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007894 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895 }
7896
7897 if (count2 >= 0xFF || count3 >= 0xFF)
7898 need_dict = 1;
7899
7900 if (need_dict) {
7901 PyObject *result = PyDict_New();
7902 PyObject *key, *value;
7903 if (!result)
7904 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007905 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007906 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007907 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007908 if (!key || !value)
7909 goto failed1;
7910 if (PyDict_SetItem(result, key, value) == -1)
7911 goto failed1;
7912 Py_DECREF(key);
7913 Py_DECREF(value);
7914 }
7915 return result;
7916 failed1:
7917 Py_XDECREF(key);
7918 Py_XDECREF(value);
7919 Py_DECREF(result);
7920 return NULL;
7921 }
7922
7923 /* Create a three-level trie */
7924 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7925 16*count2 + 128*count3 - 1);
7926 if (!result)
7927 return PyErr_NoMemory();
7928 PyObject_Init(result, &EncodingMapType);
7929 mresult = (struct encoding_map*)result;
7930 mresult->count2 = count2;
7931 mresult->count3 = count3;
7932 mlevel1 = mresult->level1;
7933 mlevel2 = mresult->level23;
7934 mlevel3 = mresult->level23 + 16*count2;
7935 memcpy(mlevel1, level1, 32);
7936 memset(mlevel2, 0xFF, 16*count2);
7937 memset(mlevel3, 0, 128*count3);
7938 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007939 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007940 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007941 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7942 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943 /* unmapped character */
7944 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007945 o1 = ch>>11;
7946 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 i2 = 16*mlevel1[o1] + o2;
7948 if (mlevel2[i2] == 0xFF)
7949 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007950 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007951 i3 = 128*mlevel2[i2] + o3;
7952 mlevel3[i3] = i;
7953 }
7954 return result;
7955}
7956
7957static int
Victor Stinner22168992011-11-20 17:09:18 +01007958encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959{
7960 struct encoding_map *map = (struct encoding_map*)mapping;
7961 int l1 = c>>11;
7962 int l2 = (c>>7) & 0xF;
7963 int l3 = c & 0x7F;
7964 int i;
7965
Victor Stinner22168992011-11-20 17:09:18 +01007966 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 if (c == 0)
7969 return 0;
7970 /* level 1*/
7971 i = map->level1[l1];
7972 if (i == 0xFF) {
7973 return -1;
7974 }
7975 /* level 2*/
7976 i = map->level23[16*i+l2];
7977 if (i == 0xFF) {
7978 return -1;
7979 }
7980 /* level 3 */
7981 i = map->level23[16*map->count2 + 128*i + l3];
7982 if (i == 0) {
7983 return -1;
7984 }
7985 return i;
7986}
7987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988/* Lookup the character ch in the mapping. If the character
7989 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007990 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007991static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007992charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993{
Christian Heimes217cfd12007-12-02 14:31:20 +00007994 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007995 PyObject *x;
7996
7997 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999 x = PyObject_GetItem(mapping, w);
8000 Py_DECREF(w);
8001 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8003 /* No mapping found means: mapping is undefined. */
8004 PyErr_Clear();
8005 x = Py_None;
8006 Py_INCREF(x);
8007 return x;
8008 } else
8009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008011 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008013 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 long value = PyLong_AS_LONG(x);
8015 if (value < 0 || value > 255) {
8016 PyErr_SetString(PyExc_TypeError,
8017 "character mapping must be in range(256)");
8018 Py_DECREF(x);
8019 return NULL;
8020 }
8021 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008023 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 /* wrong return value */
8027 PyErr_Format(PyExc_TypeError,
8028 "character mapping must return integer, bytes or None, not %.400s",
8029 x->ob_type->tp_name);
8030 Py_DECREF(x);
8031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 }
8033}
8034
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008036charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8039 /* exponentially overallocate to minimize reallocations */
8040 if (requiredsize < 2*outsize)
8041 requiredsize = 2*outsize;
8042 if (_PyBytes_Resize(outobj, requiredsize))
8043 return -1;
8044 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045}
8046
/* Outcome of a single charmapencode_output() call. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008050/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008051 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052 space is available. Return a new reference to the object that
8053 was put in the output buffer, or Py_None, if the mapping was undefined
8054 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008055 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008056static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008057charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008058 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008059{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060 PyObject *rep;
8061 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008062 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063
Christian Heimes90aa7642007-12-19 02:45:37 +00008064 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 if (res == -1)
8068 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 if (outsize<requiredsize)
8070 if (charmapencode_resize(outobj, outpos, requiredsize))
8071 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008072 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 outstart[(*outpos)++] = (char)res;
8074 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075 }
8076
8077 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 Py_DECREF(rep);
8082 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 if (PyLong_Check(rep)) {
8085 Py_ssize_t requiredsize = *outpos+1;
8086 if (outsize<requiredsize)
8087 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8088 Py_DECREF(rep);
8089 return enc_EXCEPTION;
8090 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008091 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 else {
8095 const char *repchars = PyBytes_AS_STRING(rep);
8096 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8097 Py_ssize_t requiredsize = *outpos+repsize;
8098 if (outsize<requiredsize)
8099 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8100 Py_DECREF(rep);
8101 return enc_EXCEPTION;
8102 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008103 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 memcpy(outstart + *outpos, repchars, repsize);
8105 *outpos += repsize;
8106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008107 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 Py_DECREF(rep);
8109 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110}
8111
8112/* handle an error in PyUnicode_EncodeCharmap
8113 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008114static int
8115charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008116 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008118 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008119 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120{
8121 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008122 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008123 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008124 enum PyUnicode_Kind kind;
8125 void *data;
8126 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 Py_ssize_t collstartpos = *inpos;
8129 Py_ssize_t collendpos = *inpos+1;
8130 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 char *encoding = "charmap";
8132 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008135 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136
Benjamin Petersonbac79492012-01-14 13:34:47 -05008137 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008138 return -1;
8139 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 /* find all unencodable characters */
8141 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008143 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008144 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008145 val = encoding_map_lookup(ch, mapping);
8146 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 break;
8148 ++collendpos;
8149 continue;
8150 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008151
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008152 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8153 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 if (rep==NULL)
8155 return -1;
8156 else if (rep!=Py_None) {
8157 Py_DECREF(rep);
8158 break;
8159 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 }
8163 /* cache callback name lookup
8164 * (if not done yet, i.e. it's the first error) */
8165 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 if ((errors==NULL) || (!strcmp(errors, "strict")))
8167 *known_errorHandler = 1;
8168 else if (!strcmp(errors, "replace"))
8169 *known_errorHandler = 2;
8170 else if (!strcmp(errors, "ignore"))
8171 *known_errorHandler = 3;
8172 else if (!strcmp(errors, "xmlcharrefreplace"))
8173 *known_errorHandler = 4;
8174 else
8175 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008176 }
8177 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008179 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 return -1;
8181 case 2: /* replace */
8182 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 x = charmapencode_output('?', mapping, res, respos);
8184 if (x==enc_EXCEPTION) {
8185 return -1;
8186 }
8187 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008188 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 return -1;
8190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 }
8192 /* fall through */
8193 case 3: /* ignore */
8194 *inpos = collendpos;
8195 break;
8196 case 4: /* xmlcharrefreplace */
8197 /* generate replacement (temporarily (mis)uses p) */
8198 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 char buffer[2+29+1+1];
8200 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008201 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 for (cp = buffer; *cp; ++cp) {
8203 x = charmapencode_output(*cp, mapping, res, respos);
8204 if (x==enc_EXCEPTION)
8205 return -1;
8206 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008207 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return -1;
8209 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 }
8211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 *inpos = collendpos;
8213 break;
8214 default:
8215 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008216 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008220 if (PyBytes_Check(repunicode)) {
8221 /* Directly copy bytes result to output. */
8222 Py_ssize_t outsize = PyBytes_Size(*res);
8223 Py_ssize_t requiredsize;
8224 repsize = PyBytes_Size(repunicode);
8225 requiredsize = *respos + repsize;
8226 if (requiredsize > outsize)
8227 /* Make room for all additional bytes. */
8228 if (charmapencode_resize(res, respos, requiredsize)) {
8229 Py_DECREF(repunicode);
8230 return -1;
8231 }
8232 memcpy(PyBytes_AsString(*res) + *respos,
8233 PyBytes_AsString(repunicode), repsize);
8234 *respos += repsize;
8235 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008236 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008237 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008238 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008239 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008240 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008241 Py_DECREF(repunicode);
8242 return -1;
8243 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008244 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008245 data = PyUnicode_DATA(repunicode);
8246 kind = PyUnicode_KIND(repunicode);
8247 for (index = 0; index < repsize; index++) {
8248 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8249 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008251 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 return -1;
8253 }
8254 else if (x==enc_FAILED) {
8255 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008256 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 return -1;
8258 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008259 }
8260 *inpos = newpos;
8261 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 }
8263 return 0;
8264}
8265
Alexander Belopolsky40018472011-02-26 01:02:56 +00008266PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008267_PyUnicode_EncodeCharmap(PyObject *unicode,
8268 PyObject *mapping,
8269 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 /* output object */
8272 PyObject *res = NULL;
8273 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008274 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008275 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008277 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 PyObject *errorHandler = NULL;
8279 PyObject *exc = NULL;
8280 /* the following variable is used for caching string comparisons
8281 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8282 * 3=ignore, 4=xmlcharrefreplace */
8283 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008284 void *data;
8285 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286
Benjamin Petersonbac79492012-01-14 13:34:47 -05008287 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008288 return NULL;
8289 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008290 data = PyUnicode_DATA(unicode);
8291 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008292
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 /* Default to Latin-1 */
8294 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008295 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 /* allocate enough for a simple encoding without
8298 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008299 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 if (res == NULL)
8301 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008302 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008306 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 if (x==enc_EXCEPTION) /* error */
8310 goto onError;
8311 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008312 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 &exc,
8314 &known_errorHandler, &errorHandler, errors,
8315 &res, &respos)) {
8316 goto onError;
8317 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008318 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 else
8320 /* done with this character => adjust input position */
8321 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008325 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008326 if (_PyBytes_Resize(&res, respos) < 0)
8327 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 Py_XDECREF(exc);
8330 Py_XDECREF(errorHandler);
8331 return res;
8332
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334 Py_XDECREF(res);
8335 Py_XDECREF(exc);
8336 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 return NULL;
8338}
8339
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008340/* Deprecated */
8341PyObject *
8342PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8343 Py_ssize_t size,
8344 PyObject *mapping,
8345 const char *errors)
8346{
8347 PyObject *result;
8348 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8349 if (unicode == NULL)
8350 return NULL;
8351 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8352 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008353 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008354}
8355
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356PyObject *
8357PyUnicode_AsCharmapString(PyObject *unicode,
8358 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359{
8360 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 PyErr_BadArgument();
8362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008364 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365}
8366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368static void
8369make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008371 Py_ssize_t startpos, Py_ssize_t endpos,
8372 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 *exceptionObject = _PyUnicodeTranslateError_Create(
8376 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 }
8378 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8380 goto onError;
8381 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8382 goto onError;
8383 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8384 goto onError;
8385 return;
8386 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008387 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 }
8389}
8390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391/* error handling callback helper:
8392 build arguments, call the callback and check the arguments,
8393 put the result into newpos and return the replacement string, which
8394 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395static PyObject *
8396unicode_translate_call_errorhandler(const char *errors,
8397 PyObject **errorHandler,
8398 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008400 Py_ssize_t startpos, Py_ssize_t endpos,
8401 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008403 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008405 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 PyObject *restuple;
8407 PyObject *resunicode;
8408
8409 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 }
8414
8415 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419
8420 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008425 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 Py_DECREF(restuple);
8427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 }
8429 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 &resunicode, &i_newpos)) {
8431 Py_DECREF(restuple);
8432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008434 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 else
8437 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8440 Py_DECREF(restuple);
8441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 Py_INCREF(resunicode);
8444 Py_DECREF(restuple);
8445 return resunicode;
8446}
8447
8448/* Lookup the character ch in the mapping and put the result in result,
8449 which must be decrefed by the caller.
8450 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008451static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453{
Christian Heimes217cfd12007-12-02 14:31:20 +00008454 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455 PyObject *x;
8456
8457 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 x = PyObject_GetItem(mapping, w);
8460 Py_DECREF(w);
8461 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8463 /* No mapping found means: use 1:1 mapping. */
8464 PyErr_Clear();
8465 *result = NULL;
8466 return 0;
8467 } else
8468 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 }
8470 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 *result = x;
8472 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008474 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 long value = PyLong_AS_LONG(x);
8476 long max = PyUnicode_GetMax();
8477 if (value < 0 || value > max) {
8478 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008479 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 Py_DECREF(x);
8481 return -1;
8482 }
8483 *result = x;
8484 return 0;
8485 }
8486 else if (PyUnicode_Check(x)) {
8487 *result = x;
8488 return 0;
8489 }
8490 else {
8491 /* wrong return value */
8492 PyErr_SetString(PyExc_TypeError,
8493 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 Py_DECREF(x);
8495 return -1;
8496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497}
8498/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 if not reallocate and adjust various state variables.
8500 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008501static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008506 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008507 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 /* exponentially overallocate to minimize reallocations */
8509 if (requiredsize < 2 * oldsize)
8510 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008511 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8512 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008514 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516 }
8517 return 0;
8518}
8519/* lookup the character, put the result in the output string and adjust
8520 various state variables. Return a new reference to the object that
8521 was put in the output buffer in *result, or Py_None, if the mapping was
8522 undefined (in which case no character was written).
8523 The called must decref result.
8524 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008525static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8527 PyObject *mapping, Py_UCS4 **output,
8528 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008529 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8532 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 }
8538 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008540 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 }
8544 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 Py_ssize_t repsize;
8546 if (PyUnicode_READY(*res) == -1)
8547 return -1;
8548 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 if (repsize==1) {
8550 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 }
8553 else if (repsize!=0) {
8554 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555 Py_ssize_t requiredsize = *opos +
8556 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 Py_ssize_t i;
8559 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 for(i = 0; i < repsize; i++)
8562 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 }
8565 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 return 0;
8568}
8569
Alexander Belopolsky40018472011-02-26 01:02:56 +00008570PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571_PyUnicode_TranslateCharmap(PyObject *input,
8572 PyObject *mapping,
8573 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 /* input object */
8576 char *idata;
8577 Py_ssize_t size, i;
8578 int kind;
8579 /* output buffer */
8580 Py_UCS4 *output = NULL;
8581 Py_ssize_t osize;
8582 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 char *reason = "character maps to <undefined>";
8586 PyObject *errorHandler = NULL;
8587 PyObject *exc = NULL;
8588 /* the following variable is used for caching string comparisons
8589 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8590 * 3=ignore, 4=xmlcharrefreplace */
8591 int known_errorHandler = -1;
8592
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 PyErr_BadArgument();
8595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 if (PyUnicode_READY(input) == -1)
8599 return NULL;
8600 idata = (char*)PyUnicode_DATA(input);
8601 kind = PyUnicode_KIND(input);
8602 size = PyUnicode_GET_LENGTH(input);
8603 i = 0;
8604
8605 if (size == 0) {
8606 Py_INCREF(input);
8607 return input;
8608 }
8609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610 /* allocate enough for a simple 1:1 translation without
8611 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 osize = size;
8613 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8614 opos = 0;
8615 if (output == NULL) {
8616 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 /* try to encode it */
8622 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 if (charmaptranslate_output(input, i, mapping,
8624 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 Py_XDECREF(x);
8626 goto onError;
8627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008628 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 else { /* untranslatable character */
8632 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8633 Py_ssize_t repsize;
8634 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 Py_ssize_t collstart = i;
8638 Py_ssize_t collend = i+1;
8639 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 while (collend < size) {
8643 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 goto onError;
8645 Py_XDECREF(x);
8646 if (x!=Py_None)
8647 break;
8648 ++collend;
8649 }
8650 /* cache callback name lookup
8651 * (if not done yet, i.e. it's the first error) */
8652 if (known_errorHandler==-1) {
8653 if ((errors==NULL) || (!strcmp(errors, "strict")))
8654 known_errorHandler = 1;
8655 else if (!strcmp(errors, "replace"))
8656 known_errorHandler = 2;
8657 else if (!strcmp(errors, "ignore"))
8658 known_errorHandler = 3;
8659 else if (!strcmp(errors, "xmlcharrefreplace"))
8660 known_errorHandler = 4;
8661 else
8662 known_errorHandler = 0;
8663 }
8664 switch (known_errorHandler) {
8665 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008666 make_translate_exception(&exc,
8667 input, collstart, collend, reason);
8668 if (exc != NULL)
8669 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008670 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 case 2: /* replace */
8672 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 for (coll = collstart; coll<collend; coll++)
8674 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 /* fall through */
8676 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 break;
8679 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 /* generate replacement (temporarily (mis)uses i) */
8681 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 char buffer[2+29+1+1];
8683 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8685 if (charmaptranslate_makespace(&output, &osize,
8686 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 goto onError;
8688 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 break;
8693 default:
8694 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 reason, input, &exc,
8696 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008697 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008699 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008700 Py_DECREF(repunicode);
8701 goto onError;
8702 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 repsize = PyUnicode_GET_LENGTH(repunicode);
8705 if (charmaptranslate_makespace(&output, &osize,
8706 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 Py_DECREF(repunicode);
8708 goto onError;
8709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 for (uni2 = 0; repsize-->0; ++uni2)
8711 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8712 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008715 }
8716 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8718 if (!res)
8719 goto onError;
8720 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721 Py_XDECREF(exc);
8722 Py_XDECREF(errorHandler);
8723 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 Py_XDECREF(exc);
8728 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729 return NULL;
8730}
8731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732/* Deprecated. Use PyUnicode_Translate instead. */
8733PyObject *
8734PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8735 Py_ssize_t size,
8736 PyObject *mapping,
8737 const char *errors)
8738{
Christian Heimes5f520f42012-09-11 14:03:25 +02008739 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8741 if (!unicode)
8742 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008743 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8744 Py_DECREF(unicode);
8745 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746}
8747
Alexander Belopolsky40018472011-02-26 01:02:56 +00008748PyObject *
8749PyUnicode_Translate(PyObject *str,
8750 PyObject *mapping,
8751 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752{
8753 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008754
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755 str = PyUnicode_FromObject(str);
8756 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008757 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 Py_DECREF(str);
8760 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761}
Tim Petersced69f82003-09-16 20:30:58 +00008762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008764fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765{
8766 /* No need to call PyUnicode_READY(self) because this function is only
8767 called as a callback from fixup() which does it already. */
8768 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8769 const int kind = PyUnicode_KIND(self);
8770 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008771 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008772 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 Py_ssize_t i;
8774
8775 for (i = 0; i < len; ++i) {
8776 ch = PyUnicode_READ(kind, data, i);
8777 fixed = 0;
8778 if (ch > 127) {
8779 if (Py_UNICODE_ISSPACE(ch))
8780 fixed = ' ';
8781 else {
8782 const int decimal = Py_UNICODE_TODECIMAL(ch);
8783 if (decimal >= 0)
8784 fixed = '0' + decimal;
8785 }
8786 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008787 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008788 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 PyUnicode_WRITE(kind, data, i, fixed);
8790 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008791 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008792 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 }
8795
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008796 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797}
8798
8799PyObject *
8800_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8801{
8802 if (!PyUnicode_Check(unicode)) {
8803 PyErr_BadInternalCall();
8804 return NULL;
8805 }
8806 if (PyUnicode_READY(unicode) == -1)
8807 return NULL;
8808 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8809 /* If the string is already ASCII, just return the same string */
8810 Py_INCREF(unicode);
8811 return unicode;
8812 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008813 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814}
8815
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008816PyObject *
8817PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8818 Py_ssize_t length)
8819{
Victor Stinnerf0124502011-11-21 23:12:56 +01008820 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008821 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008822 Py_UCS4 maxchar;
8823 enum PyUnicode_Kind kind;
8824 void *data;
8825
Victor Stinner99d7ad02012-02-22 13:37:39 +01008826 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008827 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008828 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008829 if (ch > 127) {
8830 int decimal = Py_UNICODE_TODECIMAL(ch);
8831 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008832 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008833 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008834 }
8835 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008836
8837 /* Copy to a new string */
8838 decimal = PyUnicode_New(length, maxchar);
8839 if (decimal == NULL)
8840 return decimal;
8841 kind = PyUnicode_KIND(decimal);
8842 data = PyUnicode_DATA(decimal);
8843 /* Iterate over code points */
8844 for (i = 0; i < length; i++) {
8845 Py_UNICODE ch = s[i];
8846 if (ch > 127) {
8847 int decimal = Py_UNICODE_TODECIMAL(ch);
8848 if (decimal >= 0)
8849 ch = '0' + decimal;
8850 }
8851 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008853 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008854}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008855/* --- Decimal Encoder ---------------------------------------------------- */
8856
Alexander Belopolsky40018472011-02-26 01:02:56 +00008857int
8858PyUnicode_EncodeDecimal(Py_UNICODE *s,
8859 Py_ssize_t length,
8860 char *output,
8861 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008862{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008863 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008864 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008865 enum PyUnicode_Kind kind;
8866 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008867
8868 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 PyErr_BadArgument();
8870 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008871 }
8872
Victor Stinner42bf7752011-11-21 22:52:58 +01008873 unicode = PyUnicode_FromUnicode(s, length);
8874 if (unicode == NULL)
8875 return -1;
8876
Benjamin Petersonbac79492012-01-14 13:34:47 -05008877 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008878 Py_DECREF(unicode);
8879 return -1;
8880 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008881 kind = PyUnicode_KIND(unicode);
8882 data = PyUnicode_DATA(unicode);
8883
Victor Stinnerb84d7232011-11-22 01:50:07 +01008884 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008885 PyObject *exc;
8886 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008888 Py_ssize_t startpos;
8889
8890 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008891
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008893 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008894 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008896 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 decimal = Py_UNICODE_TODECIMAL(ch);
8898 if (decimal >= 0) {
8899 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008900 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 continue;
8902 }
8903 if (0 < ch && ch < 256) {
8904 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008905 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 continue;
8907 }
Victor Stinner6345be92011-11-25 20:09:01 +01008908
Victor Stinner42bf7752011-11-21 22:52:58 +01008909 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008910 exc = NULL;
8911 raise_encode_exception(&exc, "decimal", unicode,
8912 startpos, startpos+1,
8913 "invalid decimal Unicode string");
8914 Py_XDECREF(exc);
8915 Py_DECREF(unicode);
8916 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008917 }
8918 /* 0-terminate the output string */
8919 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008920 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008921 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008922}
8923
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924/* --- Helpers ------------------------------------------------------------ */
8925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008927any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 Py_ssize_t start,
8929 Py_ssize_t end)
8930{
8931 int kind1, kind2, kind;
8932 void *buf1, *buf2;
8933 Py_ssize_t len1, len2, result;
8934
8935 kind1 = PyUnicode_KIND(s1);
8936 kind2 = PyUnicode_KIND(s2);
8937 kind = kind1 > kind2 ? kind1 : kind2;
8938 buf1 = PyUnicode_DATA(s1);
8939 buf2 = PyUnicode_DATA(s2);
8940 if (kind1 != kind)
8941 buf1 = _PyUnicode_AsKind(s1, kind);
8942 if (!buf1)
8943 return -2;
8944 if (kind2 != kind)
8945 buf2 = _PyUnicode_AsKind(s2, kind);
8946 if (!buf2) {
8947 if (kind1 != kind) PyMem_Free(buf1);
8948 return -2;
8949 }
8950 len1 = PyUnicode_GET_LENGTH(s1);
8951 len2 = PyUnicode_GET_LENGTH(s2);
8952
Victor Stinner794d5672011-10-10 03:21:36 +02008953 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008954 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008955 case PyUnicode_1BYTE_KIND:
8956 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8957 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8958 else
8959 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8960 break;
8961 case PyUnicode_2BYTE_KIND:
8962 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8963 break;
8964 case PyUnicode_4BYTE_KIND:
8965 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8966 break;
8967 default:
8968 assert(0); result = -2;
8969 }
8970 }
8971 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008972 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008973 case PyUnicode_1BYTE_KIND:
8974 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8975 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8976 else
8977 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8978 break;
8979 case PyUnicode_2BYTE_KIND:
8980 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8981 break;
8982 case PyUnicode_4BYTE_KIND:
8983 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8984 break;
8985 default:
8986 assert(0); result = -2;
8987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 }
8989
8990 if (kind1 != kind)
8991 PyMem_Free(buf1);
8992 if (kind2 != kind)
8993 PyMem_Free(buf2);
8994
8995 return result;
8996}
8997
8998Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008999_PyUnicode_InsertThousandsGrouping(
9000 PyObject *unicode, Py_ssize_t index,
9001 Py_ssize_t n_buffer,
9002 void *digits, Py_ssize_t n_digits,
9003 Py_ssize_t min_width,
9004 const char *grouping, PyObject *thousands_sep,
9005 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006{
Victor Stinner41a863c2012-02-24 00:37:51 +01009007 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009008 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009009 Py_ssize_t thousands_sep_len;
9010 Py_ssize_t len;
9011
9012 if (unicode != NULL) {
9013 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009014 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009015 }
9016 else {
9017 kind = PyUnicode_1BYTE_KIND;
9018 data = NULL;
9019 }
9020 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9021 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9022 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9023 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009024 if (thousands_sep_kind < kind) {
9025 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9026 if (!thousands_sep_data)
9027 return -1;
9028 }
9029 else {
9030 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9031 if (!data)
9032 return -1;
9033 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009034 }
9035
Benjamin Petersonead6b532011-12-20 17:23:42 -06009036 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009037 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009038 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009039 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009040 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009041 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009042 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009043 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009044 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009045 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009046 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009047 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009048 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009050 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009051 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009052 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009053 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009054 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009056 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009057 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009058 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009059 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009060 break;
9061 default:
9062 assert(0);
9063 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009065 if (unicode != NULL && thousands_sep_kind != kind) {
9066 if (thousands_sep_kind < kind)
9067 PyMem_Free(thousands_sep_data);
9068 else
9069 PyMem_Free(data);
9070 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009071 if (unicode == NULL) {
9072 *maxchar = 127;
9073 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009074 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009075 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009076 }
9077 }
9078 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079}
9080
9081
/* helper macro to fixup start/end slice values

   Clamps end to len, interprets negative start/end as offsets from the
   end (adding len) and clamps the result at 0. start and end must be
   modifiable lvalues; each argument is evaluated more than once, so do
   not pass expressions with side effects. Every argument is parenthesized
   in the expansion so that an expression such as `a ? b : c` passed as
   len is compared and assigned as a whole. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009096
Alexander Belopolsky40018472011-02-26 01:02:56 +00009097Py_ssize_t
9098PyUnicode_Count(PyObject *str,
9099 PyObject *substr,
9100 Py_ssize_t start,
9101 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009103 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009104 PyObject* str_obj;
9105 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106 int kind1, kind2, kind;
9107 void *buf1 = NULL, *buf2 = NULL;
9108 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009109
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009110 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009111 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009112 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009113 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009114 if (!sub_obj) {
9115 Py_DECREF(str_obj);
9116 return -1;
9117 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009118 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009119 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 Py_DECREF(str_obj);
9121 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 }
Tim Petersced69f82003-09-16 20:30:58 +00009123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 kind1 = PyUnicode_KIND(str_obj);
9125 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009126 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009129 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009130 if (kind2 > kind) {
9131 Py_DECREF(sub_obj);
9132 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009133 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009134 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009135 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137 if (!buf2)
9138 goto onError;
9139 len1 = PyUnicode_GET_LENGTH(str_obj);
9140 len2 = PyUnicode_GET_LENGTH(sub_obj);
9141
9142 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009143 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009145 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9146 result = asciilib_count(
9147 ((Py_UCS1*)buf1) + start, end - start,
9148 buf2, len2, PY_SSIZE_T_MAX
9149 );
9150 else
9151 result = ucs1lib_count(
9152 ((Py_UCS1*)buf1) + start, end - start,
9153 buf2, len2, PY_SSIZE_T_MAX
9154 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 break;
9156 case PyUnicode_2BYTE_KIND:
9157 result = ucs2lib_count(
9158 ((Py_UCS2*)buf1) + start, end - start,
9159 buf2, len2, PY_SSIZE_T_MAX
9160 );
9161 break;
9162 case PyUnicode_4BYTE_KIND:
9163 result = ucs4lib_count(
9164 ((Py_UCS4*)buf1) + start, end - start,
9165 buf2, len2, PY_SSIZE_T_MAX
9166 );
9167 break;
9168 default:
9169 assert(0); result = 0;
9170 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009171
9172 Py_DECREF(sub_obj);
9173 Py_DECREF(str_obj);
9174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 if (kind2 != kind)
9176 PyMem_Free(buf2);
9177
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 onError:
9180 Py_DECREF(sub_obj);
9181 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 if (kind2 != kind && buf2)
9183 PyMem_Free(buf2);
9184 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185}
9186
Alexander Belopolsky40018472011-02-26 01:02:56 +00009187Py_ssize_t
9188PyUnicode_Find(PyObject *str,
9189 PyObject *sub,
9190 Py_ssize_t start,
9191 Py_ssize_t end,
9192 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009194 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009195
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009197 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009198 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009199 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009200 if (!sub) {
9201 Py_DECREF(str);
9202 return -2;
9203 }
9204 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9205 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 Py_DECREF(str);
9207 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 }
Tim Petersced69f82003-09-16 20:30:58 +00009209
Victor Stinner794d5672011-10-10 03:21:36 +02009210 result = any_find_slice(direction,
9211 str, sub, start, end
9212 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009213
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009215 Py_DECREF(sub);
9216
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 return result;
9218}
9219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220Py_ssize_t
9221PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9222 Py_ssize_t start, Py_ssize_t end,
9223 int direction)
9224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009226 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 if (PyUnicode_READY(str) == -1)
9228 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009229 if (start < 0 || end < 0) {
9230 PyErr_SetString(PyExc_IndexError, "string index out of range");
9231 return -2;
9232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 if (end > PyUnicode_GET_LENGTH(str))
9234 end = PyUnicode_GET_LENGTH(str);
9235 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009236 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9237 kind, end-start, ch, direction);
9238 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009240 else
9241 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242}
9243
Alexander Belopolsky40018472011-02-26 01:02:56 +00009244static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009245tailmatch(PyObject *self,
9246 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009247 Py_ssize_t start,
9248 Py_ssize_t end,
9249 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 int kind_self;
9252 int kind_sub;
9253 void *data_self;
9254 void *data_sub;
9255 Py_ssize_t offset;
9256 Py_ssize_t i;
9257 Py_ssize_t end_sub;
9258
9259 if (PyUnicode_READY(self) == -1 ||
9260 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009261 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262
9263 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 return 1;
9265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9267 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009269 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 kind_self = PyUnicode_KIND(self);
9272 data_self = PyUnicode_DATA(self);
9273 kind_sub = PyUnicode_KIND(substring);
9274 data_sub = PyUnicode_DATA(substring);
9275 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9276
9277 if (direction > 0)
9278 offset = end;
9279 else
9280 offset = start;
9281
9282 if (PyUnicode_READ(kind_self, data_self, offset) ==
9283 PyUnicode_READ(kind_sub, data_sub, 0) &&
9284 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9285 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9286 /* If both are of the same kind, memcmp is sufficient */
9287 if (kind_self == kind_sub) {
9288 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009289 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 data_sub,
9291 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009292 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 }
9294 /* otherwise we have to compare each character by first accesing it */
9295 else {
9296 /* We do not need to compare 0 and len(substring)-1 because
9297 the if statement above ensured already that they are equal
9298 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 for (i = 1; i < end_sub; ++i) {
9300 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9301 PyUnicode_READ(kind_sub, data_sub, i))
9302 return 0;
9303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 }
9307
9308 return 0;
9309}
9310
Alexander Belopolsky40018472011-02-26 01:02:56 +00009311Py_ssize_t
9312PyUnicode_Tailmatch(PyObject *str,
9313 PyObject *substr,
9314 Py_ssize_t start,
9315 Py_ssize_t end,
9316 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009318 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009319
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320 str = PyUnicode_FromObject(str);
9321 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323 substr = PyUnicode_FromObject(substr);
9324 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 Py_DECREF(str);
9326 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327 }
Tim Petersced69f82003-09-16 20:30:58 +00009328
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009329 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009330 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009331 Py_DECREF(str);
9332 Py_DECREF(substr);
9333 return result;
9334}
9335
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336/* Apply fixfct filter to the Unicode object self and return a
9337 reference to the modified object */
9338
Alexander Belopolsky40018472011-02-26 01:02:56 +00009339static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009340fixup(PyObject *self,
9341 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 PyObject *u;
9344 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009345 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009347 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009350 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 /* fix functions return the new maximum character in a string,
9353 if the kind of the resulting unicode object does not change,
9354 everything is fine. Otherwise we need to change the string kind
9355 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009356 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009357
9358 if (maxchar_new == 0) {
9359 /* no changes */;
9360 if (PyUnicode_CheckExact(self)) {
9361 Py_DECREF(u);
9362 Py_INCREF(self);
9363 return self;
9364 }
9365 else
9366 return u;
9367 }
9368
Victor Stinnere6abb482012-05-02 01:15:40 +02009369 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370
Victor Stinnereaab6042011-12-11 22:22:39 +01009371 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009373
9374 /* In case the maximum character changed, we need to
9375 convert the string to the new category. */
9376 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9377 if (v == NULL) {
9378 Py_DECREF(u);
9379 return NULL;
9380 }
9381 if (maxchar_new > maxchar_old) {
9382 /* If the maxchar increased so that the kind changed, not all
9383 characters are representable anymore and we need to fix the
9384 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009385 _PyUnicode_FastCopyCharacters(v, 0,
9386 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009387 maxchar_old = fixfct(v);
9388 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 }
9390 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009391 _PyUnicode_FastCopyCharacters(v, 0,
9392 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009394 Py_DECREF(u);
9395 assert(_PyUnicode_CheckConsistency(v, 1));
9396 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397}
9398
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009399static PyObject *
9400ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009402 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9403 char *resdata, *data = PyUnicode_DATA(self);
9404 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009405
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009406 res = PyUnicode_New(len, 127);
9407 if (res == NULL)
9408 return NULL;
9409 resdata = PyUnicode_DATA(res);
9410 if (lower)
9411 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009413 _Py_bytes_upper(resdata, data, len);
9414 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415}
9416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009418handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009420 Py_ssize_t j;
9421 int final_sigma;
9422 Py_UCS4 c;
9423 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009424
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009425 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9426
9427 where ! is a negation and \p{xxx} is a character with property xxx.
9428 */
9429 for (j = i - 1; j >= 0; j--) {
9430 c = PyUnicode_READ(kind, data, j);
9431 if (!_PyUnicode_IsCaseIgnorable(c))
9432 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009433 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009434 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9435 if (final_sigma) {
9436 for (j = i + 1; j < length; j++) {
9437 c = PyUnicode_READ(kind, data, j);
9438 if (!_PyUnicode_IsCaseIgnorable(c))
9439 break;
9440 }
9441 final_sigma = j == length || !_PyUnicode_IsCased(c);
9442 }
9443 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444}
9445
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009446static int
9447lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9448 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009450 /* Obscure special case. */
9451 if (c == 0x3A3) {
9452 mapped[0] = handle_capital_sigma(kind, data, length, i);
9453 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009455 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456}
9457
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009458static Py_ssize_t
9459do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009461 Py_ssize_t i, k = 0;
9462 int n_res, j;
9463 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009464
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009465 c = PyUnicode_READ(kind, data, 0);
9466 n_res = _PyUnicode_ToUpperFull(c, mapped);
9467 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009468 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009469 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009471 for (i = 1; i < length; i++) {
9472 c = PyUnicode_READ(kind, data, i);
9473 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9474 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009475 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009476 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009477 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009478 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009479 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480}
9481
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009482static Py_ssize_t
9483do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9484 Py_ssize_t i, k = 0;
9485
9486 for (i = 0; i < length; i++) {
9487 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9488 int n_res, j;
9489 if (Py_UNICODE_ISUPPER(c)) {
9490 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9491 }
9492 else if (Py_UNICODE_ISLOWER(c)) {
9493 n_res = _PyUnicode_ToUpperFull(c, mapped);
9494 }
9495 else {
9496 n_res = 1;
9497 mapped[0] = c;
9498 }
9499 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009500 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009501 res[k++] = mapped[j];
9502 }
9503 }
9504 return k;
9505}
9506
9507static Py_ssize_t
9508do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9509 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009511 Py_ssize_t i, k = 0;
9512
9513 for (i = 0; i < length; i++) {
9514 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9515 int n_res, j;
9516 if (lower)
9517 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9518 else
9519 n_res = _PyUnicode_ToUpperFull(c, mapped);
9520 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009521 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009522 res[k++] = mapped[j];
9523 }
9524 }
9525 return k;
9526}
9527
9528static Py_ssize_t
9529do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9530{
9531 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9532}
9533
9534static Py_ssize_t
9535do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9536{
9537 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9538}
9539
Benjamin Petersone51757f2012-01-12 21:10:29 -05009540static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009541do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9542{
9543 Py_ssize_t i, k = 0;
9544
9545 for (i = 0; i < length; i++) {
9546 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9547 Py_UCS4 mapped[3];
9548 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9549 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009550 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009551 res[k++] = mapped[j];
9552 }
9553 }
9554 return k;
9555}
9556
9557static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009558do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9559{
9560 Py_ssize_t i, k = 0;
9561 int previous_is_cased;
9562
9563 previous_is_cased = 0;
9564 for (i = 0; i < length; i++) {
9565 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9566 Py_UCS4 mapped[3];
9567 int n_res, j;
9568
9569 if (previous_is_cased)
9570 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9571 else
9572 n_res = _PyUnicode_ToTitleFull(c, mapped);
9573
9574 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009575 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009576 res[k++] = mapped[j];
9577 }
9578
9579 previous_is_cased = _PyUnicode_IsCased(c);
9580 }
9581 return k;
9582}
9583
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009584static PyObject *
9585case_operation(PyObject *self,
9586 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9587{
9588 PyObject *res = NULL;
9589 Py_ssize_t length, newlength = 0;
9590 int kind, outkind;
9591 void *data, *outdata;
9592 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9593
Benjamin Petersoneea48462012-01-16 14:28:50 -05009594 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009595
9596 kind = PyUnicode_KIND(self);
9597 data = PyUnicode_DATA(self);
9598 length = PyUnicode_GET_LENGTH(self);
9599 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9600 if (tmp == NULL)
9601 return PyErr_NoMemory();
9602 newlength = perform(kind, data, length, tmp, &maxchar);
9603 res = PyUnicode_New(newlength, maxchar);
9604 if (res == NULL)
9605 goto leave;
9606 tmpend = tmp + newlength;
9607 outdata = PyUnicode_DATA(res);
9608 outkind = PyUnicode_KIND(res);
9609 switch (outkind) {
9610 case PyUnicode_1BYTE_KIND:
9611 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9612 break;
9613 case PyUnicode_2BYTE_KIND:
9614 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9615 break;
9616 case PyUnicode_4BYTE_KIND:
9617 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9618 break;
9619 default:
9620 assert(0);
9621 break;
9622 }
9623 leave:
9624 PyMem_FREE(tmp);
9625 return res;
9626}
9627
Tim Peters8ce9f162004-08-27 01:49:32 +00009628PyObject *
9629PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009632 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009634 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009635 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9636 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009637 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009639 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009641 int use_memcpy;
9642 unsigned char *res_data = NULL, *sep_data = NULL;
9643 PyObject *last_obj;
9644 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009646 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009647 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009648 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009649 }
9650
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009651 /* NOTE: the following code can't call back into Python code,
9652 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009653 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009654
Tim Peters05eba1f2004-08-27 21:32:02 +00009655 seqlen = PySequence_Fast_GET_SIZE(fseq);
9656 /* If empty sequence, return u"". */
9657 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009658 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009659 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009660 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009661
Tim Peters05eba1f2004-08-27 21:32:02 +00009662 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009663 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009664 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009665 if (seqlen == 1) {
9666 if (PyUnicode_CheckExact(items[0])) {
9667 res = items[0];
9668 Py_INCREF(res);
9669 Py_DECREF(fseq);
9670 return res;
9671 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009672 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009673 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009674 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009675 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009676 /* Set up sep and seplen */
9677 if (separator == NULL) {
9678 /* fall back to a blank space separator */
9679 sep = PyUnicode_FromOrdinal(' ');
9680 if (!sep)
9681 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009682 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009683 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009684 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009685 else {
9686 if (!PyUnicode_Check(separator)) {
9687 PyErr_Format(PyExc_TypeError,
9688 "separator: expected str instance,"
9689 " %.80s found",
9690 Py_TYPE(separator)->tp_name);
9691 goto onError;
9692 }
9693 if (PyUnicode_READY(separator))
9694 goto onError;
9695 sep = separator;
9696 seplen = PyUnicode_GET_LENGTH(separator);
9697 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9698 /* inc refcount to keep this code path symmetric with the
9699 above case of a blank separator */
9700 Py_INCREF(sep);
9701 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009702 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009703 }
9704
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009705 /* There are at least two things to join, or else we have a subclass
9706 * of str in the sequence.
9707 * Do a pre-pass to figure out the total amount of space we'll
9708 * need (sz), and see whether all argument are strings.
9709 */
9710 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009711#ifdef Py_DEBUG
9712 use_memcpy = 0;
9713#else
9714 use_memcpy = 1;
9715#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009716 for (i = 0; i < seqlen; i++) {
9717 const Py_ssize_t old_sz = sz;
9718 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009719 if (!PyUnicode_Check(item)) {
9720 PyErr_Format(PyExc_TypeError,
9721 "sequence item %zd: expected str instance,"
9722 " %.80s found",
9723 i, Py_TYPE(item)->tp_name);
9724 goto onError;
9725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 if (PyUnicode_READY(item) == -1)
9727 goto onError;
9728 sz += PyUnicode_GET_LENGTH(item);
9729 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009730 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009731 if (i != 0)
9732 sz += seplen;
9733 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9734 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009735 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009736 goto onError;
9737 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009738 if (use_memcpy && last_obj != NULL) {
9739 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9740 use_memcpy = 0;
9741 }
9742 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009743 }
Tim Petersced69f82003-09-16 20:30:58 +00009744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009746 if (res == NULL)
9747 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009748
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009749 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009750#ifdef Py_DEBUG
9751 use_memcpy = 0;
9752#else
9753 if (use_memcpy) {
9754 res_data = PyUnicode_1BYTE_DATA(res);
9755 kind = PyUnicode_KIND(res);
9756 if (seplen != 0)
9757 sep_data = PyUnicode_1BYTE_DATA(sep);
9758 }
9759#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009760 if (use_memcpy) {
9761 for (i = 0; i < seqlen; ++i) {
9762 Py_ssize_t itemlen;
9763 item = items[i];
9764
9765 /* Copy item, and maybe the separator. */
9766 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009767 Py_MEMCPY(res_data,
9768 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009769 kind * seplen);
9770 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009771 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009772
9773 itemlen = PyUnicode_GET_LENGTH(item);
9774 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009775 Py_MEMCPY(res_data,
9776 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009777 kind * itemlen);
9778 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009779 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009780 }
9781 assert(res_data == PyUnicode_1BYTE_DATA(res)
9782 + kind * PyUnicode_GET_LENGTH(res));
9783 }
9784 else {
9785 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9786 Py_ssize_t itemlen;
9787 item = items[i];
9788
9789 /* Copy item, and maybe the separator. */
9790 if (i && seplen != 0) {
9791 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9792 res_offset += seplen;
9793 }
9794
9795 itemlen = PyUnicode_GET_LENGTH(item);
9796 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009797 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009798 res_offset += itemlen;
9799 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009800 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009801 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009802 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009803
Tim Peters05eba1f2004-08-27 21:32:02 +00009804 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009806 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808
Benjamin Peterson29060642009-01-31 22:14:21 +00009809 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009810 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009812 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813 return NULL;
9814}
9815
/* Store value into length consecutive characters of a character buffer,
   beginning at character index start.

   kind selects the element width (1, 2 or 4 byte kind) that data is
   interpreted with; the wchar kind is not supported.  value is truncated
   to unsigned char for the 1 byte kind.  The arguments are evaluated
   more than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9839
Victor Stinnerd3f08822012-05-29 12:57:52 +02009840void
9841_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9842 Py_UCS4 fill_char)
9843{
9844 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9845 const void *data = PyUnicode_DATA(unicode);
9846 assert(PyUnicode_IS_READY(unicode));
9847 assert(unicode_modifiable(unicode));
9848 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9849 assert(start >= 0);
9850 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9851 FILL(kind, data, fill_char, start, length);
9852}
9853
Victor Stinner3fe55312012-01-04 00:33:50 +01009854Py_ssize_t
9855PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9856 Py_UCS4 fill_char)
9857{
9858 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009859
9860 if (!PyUnicode_Check(unicode)) {
9861 PyErr_BadInternalCall();
9862 return -1;
9863 }
9864 if (PyUnicode_READY(unicode) == -1)
9865 return -1;
9866 if (unicode_check_modifiable(unicode))
9867 return -1;
9868
Victor Stinnerd3f08822012-05-29 12:57:52 +02009869 if (start < 0) {
9870 PyErr_SetString(PyExc_IndexError, "string index out of range");
9871 return -1;
9872 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009873 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9874 PyErr_SetString(PyExc_ValueError,
9875 "fill character is bigger than "
9876 "the string maximum character");
9877 return -1;
9878 }
9879
9880 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9881 length = Py_MIN(maxlen, length);
9882 if (length <= 0)
9883 return 0;
9884
Victor Stinnerd3f08822012-05-29 12:57:52 +02009885 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009886 return length;
9887}
9888
Victor Stinner9310abb2011-10-05 00:59:23 +02009889static PyObject *
9890pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009891 Py_ssize_t left,
9892 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 PyObject *u;
9896 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009897 int kind;
9898 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899
9900 if (left < 0)
9901 left = 0;
9902 if (right < 0)
9903 right = 0;
9904
Victor Stinnerc4b49542011-12-11 22:44:26 +01009905 if (left == 0 && right == 0)
9906 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9909 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009910 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9911 return NULL;
9912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009914 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009916 if (!u)
9917 return NULL;
9918
9919 kind = PyUnicode_KIND(u);
9920 data = PyUnicode_DATA(u);
9921 if (left)
9922 FILL(kind, data, fill, 0, left);
9923 if (right)
9924 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009925 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009926 assert(_PyUnicode_CheckConsistency(u, 1));
9927 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928}
9929
Alexander Belopolsky40018472011-02-26 01:02:56 +00009930PyObject *
9931PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934
9935 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009936 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009938 if (PyUnicode_READY(string) == -1) {
9939 Py_DECREF(string);
9940 return NULL;
9941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942
Benjamin Petersonead6b532011-12-20 17:23:42 -06009943 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009945 if (PyUnicode_IS_ASCII(string))
9946 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009947 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009948 PyUnicode_GET_LENGTH(string), keepends);
9949 else
9950 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009951 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009952 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 break;
9954 case PyUnicode_2BYTE_KIND:
9955 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009956 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 PyUnicode_GET_LENGTH(string), keepends);
9958 break;
9959 case PyUnicode_4BYTE_KIND:
9960 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009961 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 PyUnicode_GET_LENGTH(string), keepends);
9963 break;
9964 default:
9965 assert(0);
9966 list = 0;
9967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968 Py_DECREF(string);
9969 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970}
9971
Alexander Belopolsky40018472011-02-26 01:02:56 +00009972static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009973split(PyObject *self,
9974 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009975 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 int kind1, kind2, kind;
9978 void *buf1, *buf2;
9979 Py_ssize_t len1, len2;
9980 PyObject* out;
9981
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009983 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 if (PyUnicode_READY(self) == -1)
9986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009989 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009991 if (PyUnicode_IS_ASCII(self))
9992 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009993 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994 PyUnicode_GET_LENGTH(self), maxcount
9995 );
9996 else
9997 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009998 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009999 PyUnicode_GET_LENGTH(self), maxcount
10000 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 case PyUnicode_2BYTE_KIND:
10002 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010003 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 PyUnicode_GET_LENGTH(self), maxcount
10005 );
10006 case PyUnicode_4BYTE_KIND:
10007 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010008 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 PyUnicode_GET_LENGTH(self), maxcount
10010 );
10011 default:
10012 assert(0);
10013 return NULL;
10014 }
10015
10016 if (PyUnicode_READY(substring) == -1)
10017 return NULL;
10018
10019 kind1 = PyUnicode_KIND(self);
10020 kind2 = PyUnicode_KIND(substring);
10021 kind = kind1 > kind2 ? kind1 : kind2;
10022 buf1 = PyUnicode_DATA(self);
10023 buf2 = PyUnicode_DATA(substring);
10024 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010025 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 if (!buf1)
10027 return NULL;
10028 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010029 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 if (!buf2) {
10031 if (kind1 != kind) PyMem_Free(buf1);
10032 return NULL;
10033 }
10034 len1 = PyUnicode_GET_LENGTH(self);
10035 len2 = PyUnicode_GET_LENGTH(substring);
10036
Benjamin Petersonead6b532011-12-20 17:23:42 -060010037 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010039 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10040 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010041 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010042 else
10043 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 break;
10046 case PyUnicode_2BYTE_KIND:
10047 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010048 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 break;
10050 case PyUnicode_4BYTE_KIND:
10051 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010052 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 break;
10054 default:
10055 out = NULL;
10056 }
10057 if (kind1 != kind)
10058 PyMem_Free(buf1);
10059 if (kind2 != kind)
10060 PyMem_Free(buf2);
10061 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062}
10063
Alexander Belopolsky40018472011-02-26 01:02:56 +000010064static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010065rsplit(PyObject *self,
10066 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010067 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010068{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 int kind1, kind2, kind;
10070 void *buf1, *buf2;
10071 Py_ssize_t len1, len2;
10072 PyObject* out;
10073
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010074 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010075 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 if (PyUnicode_READY(self) == -1)
10078 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010081 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010083 if (PyUnicode_IS_ASCII(self))
10084 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010085 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010086 PyUnicode_GET_LENGTH(self), maxcount
10087 );
10088 else
10089 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010090 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010091 PyUnicode_GET_LENGTH(self), maxcount
10092 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 case PyUnicode_2BYTE_KIND:
10094 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010095 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 PyUnicode_GET_LENGTH(self), maxcount
10097 );
10098 case PyUnicode_4BYTE_KIND:
10099 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010100 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 PyUnicode_GET_LENGTH(self), maxcount
10102 );
10103 default:
10104 assert(0);
10105 return NULL;
10106 }
10107
10108 if (PyUnicode_READY(substring) == -1)
10109 return NULL;
10110
10111 kind1 = PyUnicode_KIND(self);
10112 kind2 = PyUnicode_KIND(substring);
10113 kind = kind1 > kind2 ? kind1 : kind2;
10114 buf1 = PyUnicode_DATA(self);
10115 buf2 = PyUnicode_DATA(substring);
10116 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010117 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 if (!buf1)
10119 return NULL;
10120 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010121 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 if (!buf2) {
10123 if (kind1 != kind) PyMem_Free(buf1);
10124 return NULL;
10125 }
10126 len1 = PyUnicode_GET_LENGTH(self);
10127 len2 = PyUnicode_GET_LENGTH(substring);
10128
Benjamin Petersonead6b532011-12-20 17:23:42 -060010129 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010131 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10132 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010133 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010134 else
10135 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010136 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 break;
10138 case PyUnicode_2BYTE_KIND:
10139 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010140 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 break;
10142 case PyUnicode_4BYTE_KIND:
10143 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010144 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 break;
10146 default:
10147 out = NULL;
10148 }
10149 if (kind1 != kind)
10150 PyMem_Free(buf1);
10151 if (kind2 != kind)
10152 PyMem_Free(buf2);
10153 return out;
10154}
10155
10156static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010157anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10158 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010160 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010162 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10163 return asciilib_find(buf1, len1, buf2, len2, offset);
10164 else
10165 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 case PyUnicode_2BYTE_KIND:
10167 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10168 case PyUnicode_4BYTE_KIND:
10169 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10170 }
10171 assert(0);
10172 return -1;
10173}
10174
10175static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010176anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10177 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010179 switch (kind) {
10180 case PyUnicode_1BYTE_KIND:
10181 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10182 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10183 else
10184 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10185 case PyUnicode_2BYTE_KIND:
10186 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10187 case PyUnicode_4BYTE_KIND:
10188 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10189 }
10190 assert(0);
10191 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010192}
10193
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010194static void
10195replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10196 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10197{
10198 int kind = PyUnicode_KIND(u);
10199 void *data = PyUnicode_DATA(u);
10200 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10201 if (kind == PyUnicode_1BYTE_KIND) {
10202 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10203 (Py_UCS1 *)data + len,
10204 u1, u2, maxcount);
10205 }
10206 else if (kind == PyUnicode_2BYTE_KIND) {
10207 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10208 (Py_UCS2 *)data + len,
10209 u1, u2, maxcount);
10210 }
10211 else {
10212 assert(kind == PyUnicode_4BYTE_KIND);
10213 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10214 (Py_UCS4 *)data + len,
10215 u1, u2, maxcount);
10216 }
10217}
10218
Alexander Belopolsky40018472011-02-26 01:02:56 +000010219static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220replace(PyObject *self, PyObject *str1,
10221 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 PyObject *u;
10224 char *sbuf = PyUnicode_DATA(self);
10225 char *buf1 = PyUnicode_DATA(str1);
10226 char *buf2 = PyUnicode_DATA(str2);
10227 int srelease = 0, release1 = 0, release2 = 0;
10228 int skind = PyUnicode_KIND(self);
10229 int kind1 = PyUnicode_KIND(str1);
10230 int kind2 = PyUnicode_KIND(str2);
10231 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10232 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10233 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010234 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010235 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236
10237 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010238 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010240 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241
Victor Stinner59de0ee2011-10-07 10:01:28 +020010242 if (str1 == str2)
10243 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244
Victor Stinner49a0a212011-10-12 23:46:10 +020010245 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010246 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10247 if (maxchar < maxchar_str1)
10248 /* substring too wide to be present */
10249 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010250 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10251 /* Replacing str1 with str2 may cause a maxchar reduction in the
10252 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010253 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010254 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010257 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010259 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010261 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010262 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010263 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010264
Victor Stinner69ed0f42013-04-09 21:48:24 +020010265 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010266 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010267 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010268 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010269 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010271 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010273
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010274 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10275 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010276 }
10277 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 int rkind = skind;
10279 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010280 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (kind1 < rkind) {
10283 /* widen substring */
10284 buf1 = _PyUnicode_AsKind(str1, rkind);
10285 if (!buf1) goto error;
10286 release1 = 1;
10287 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010288 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010289 if (i < 0)
10290 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 if (rkind > kind2) {
10292 /* widen replacement */
10293 buf2 = _PyUnicode_AsKind(str2, rkind);
10294 if (!buf2) goto error;
10295 release2 = 1;
10296 }
10297 else if (rkind < kind2) {
10298 /* widen self and buf1 */
10299 rkind = kind2;
10300 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010301 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 sbuf = _PyUnicode_AsKind(self, rkind);
10303 if (!sbuf) goto error;
10304 srelease = 1;
10305 buf1 = _PyUnicode_AsKind(str1, rkind);
10306 if (!buf1) goto error;
10307 release1 = 1;
10308 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010309 u = PyUnicode_New(slen, maxchar);
10310 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010312 assert(PyUnicode_KIND(u) == rkind);
10313 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010314
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010315 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010316 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010317 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010319 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010321
10322 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010324 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010325 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010326 if (i == -1)
10327 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010328 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010330 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010332 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010334 }
10335 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010337 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 int rkind = skind;
10339 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010342 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 buf1 = _PyUnicode_AsKind(str1, rkind);
10344 if (!buf1) goto error;
10345 release1 = 1;
10346 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010348 if (n == 0)
10349 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010351 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 buf2 = _PyUnicode_AsKind(str2, rkind);
10353 if (!buf2) goto error;
10354 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010357 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 rkind = kind2;
10359 sbuf = _PyUnicode_AsKind(self, rkind);
10360 if (!sbuf) goto error;
10361 srelease = 1;
10362 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010363 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 buf1 = _PyUnicode_AsKind(str1, rkind);
10365 if (!buf1) goto error;
10366 release1 = 1;
10367 }
10368 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10369 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010370 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 PyErr_SetString(PyExc_OverflowError,
10372 "replace string is too long");
10373 goto error;
10374 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010375 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010376 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010377 _Py_INCREF_UNICODE_EMPTY();
10378 if (!unicode_empty)
10379 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010380 u = unicode_empty;
10381 goto done;
10382 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010383 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 PyErr_SetString(PyExc_OverflowError,
10385 "replace string is too long");
10386 goto error;
10387 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010388 u = PyUnicode_New(new_size, maxchar);
10389 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010391 assert(PyUnicode_KIND(u) == rkind);
10392 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 ires = i = 0;
10394 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010395 while (n-- > 0) {
10396 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010397 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010398 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010400 if (j == -1)
10401 break;
10402 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010403 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010404 memcpy(res + rkind * ires,
10405 sbuf + rkind * i,
10406 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010408 }
10409 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010411 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010413 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010419 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010420 memcpy(res + rkind * ires,
10421 sbuf + rkind * i,
10422 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010423 }
10424 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010425 /* interleave */
10426 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010427 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010429 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010431 if (--n <= 0)
10432 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010433 memcpy(res + rkind * ires,
10434 sbuf + rkind * i,
10435 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 ires++;
10437 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010438 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010439 memcpy(res + rkind * ires,
10440 sbuf + rkind * i,
10441 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010442 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010443 }
10444
10445 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010446 unicode_adjust_maxchar(&u);
10447 if (u == NULL)
10448 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010450
10451 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (srelease)
10453 PyMem_FREE(sbuf);
10454 if (release1)
10455 PyMem_FREE(buf1);
10456 if (release2)
10457 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010458 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010460
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010462 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 if (srelease)
10464 PyMem_FREE(sbuf);
10465 if (release1)
10466 PyMem_FREE(buf1);
10467 if (release2)
10468 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010469 return unicode_result_unchanged(self);
10470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 error:
10472 if (srelease && sbuf)
10473 PyMem_FREE(sbuf);
10474 if (release1 && buf1)
10475 PyMem_FREE(buf1);
10476 if (release2 && buf2)
10477 PyMem_FREE(buf2);
10478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479}
10480
10481/* --- Unicode Object Methods --------------------------------------------- */
10482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010483PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485\n\
10486Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010487characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488
10489static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010490unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010492 if (PyUnicode_READY(self) == -1)
10493 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010494 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495}
10496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010497PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010498 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499\n\
10500Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010501have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
10503static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010504unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010506 if (PyUnicode_READY(self) == -1)
10507 return NULL;
10508 if (PyUnicode_GET_LENGTH(self) == 0)
10509 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010510 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511}
10512
Benjamin Petersond5890c82012-01-14 13:23:30 -050010513PyDoc_STRVAR(casefold__doc__,
10514 "S.casefold() -> str\n\
10515\n\
10516Return a version of S suitable for caseless comparisons.");
10517
10518static PyObject *
10519unicode_casefold(PyObject *self)
10520{
10521 if (PyUnicode_READY(self) == -1)
10522 return NULL;
10523 if (PyUnicode_IS_ASCII(self))
10524 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010525 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010526}
10527
10528
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010529/* Argument converter. Coerces to a single unicode character */
10530
10531static int
10532convert_uc(PyObject *obj, void *addr)
10533{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010535 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010536
Benjamin Peterson14339b62009-01-31 16:36:08 +000010537 uniobj = PyUnicode_FromObject(obj);
10538 if (uniobj == NULL) {
10539 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010540 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541 return 0;
10542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010544 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010545 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010546 Py_DECREF(uniobj);
10547 return 0;
10548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010550 Py_DECREF(uniobj);
10551 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010552}
10553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010554PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010557Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010558done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559
10560static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010561unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010563 Py_ssize_t marg, left;
10564 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 Py_UCS4 fillchar = ' ';
10566
Victor Stinnere9a29352011-10-01 02:14:59 +020010567 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569
Benjamin Petersonbac79492012-01-14 13:34:47 -050010570 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571 return NULL;
10572
Victor Stinnerc4b49542011-12-11 22:44:26 +010010573 if (PyUnicode_GET_LENGTH(self) >= width)
10574 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575
Victor Stinnerc4b49542011-12-11 22:44:26 +010010576 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577 left = marg / 2 + (marg & width & 1);
10578
Victor Stinner9310abb2011-10-05 00:59:23 +020010579 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580}
10581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582/* This function assumes that str1 and str2 are readied by the caller. */
10583
Marc-André Lemburge5034372000-08-08 08:04:29 +000010584static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010585unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010586{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010587#define COMPARE(TYPE1, TYPE2) \
10588 do { \
10589 TYPE1* p1 = (TYPE1 *)data1; \
10590 TYPE2* p2 = (TYPE2 *)data2; \
10591 TYPE1* end = p1 + len; \
10592 Py_UCS4 c1, c2; \
10593 for (; p1 != end; p1++, p2++) { \
10594 c1 = *p1; \
10595 c2 = *p2; \
10596 if (c1 != c2) \
10597 return (c1 < c2) ? -1 : 1; \
10598 } \
10599 } \
10600 while (0)
10601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 int kind1, kind2;
10603 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010604 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 kind1 = PyUnicode_KIND(str1);
10607 kind2 = PyUnicode_KIND(str2);
10608 data1 = PyUnicode_DATA(str1);
10609 data2 = PyUnicode_DATA(str2);
10610 len1 = PyUnicode_GET_LENGTH(str1);
10611 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010612 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010613
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010614 switch(kind1) {
10615 case PyUnicode_1BYTE_KIND:
10616 {
10617 switch(kind2) {
10618 case PyUnicode_1BYTE_KIND:
10619 {
10620 int cmp = memcmp(data1, data2, len);
10621 /* normalize result of memcmp() into the range [-1; 1] */
10622 if (cmp < 0)
10623 return -1;
10624 if (cmp > 0)
10625 return 1;
10626 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010627 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010628 case PyUnicode_2BYTE_KIND:
10629 COMPARE(Py_UCS1, Py_UCS2);
10630 break;
10631 case PyUnicode_4BYTE_KIND:
10632 COMPARE(Py_UCS1, Py_UCS4);
10633 break;
10634 default:
10635 assert(0);
10636 }
10637 break;
10638 }
10639 case PyUnicode_2BYTE_KIND:
10640 {
10641 switch(kind2) {
10642 case PyUnicode_1BYTE_KIND:
10643 COMPARE(Py_UCS2, Py_UCS1);
10644 break;
10645 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010646 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010647 COMPARE(Py_UCS2, Py_UCS2);
10648 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010649 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010650 case PyUnicode_4BYTE_KIND:
10651 COMPARE(Py_UCS2, Py_UCS4);
10652 break;
10653 default:
10654 assert(0);
10655 }
10656 break;
10657 }
10658 case PyUnicode_4BYTE_KIND:
10659 {
10660 switch(kind2) {
10661 case PyUnicode_1BYTE_KIND:
10662 COMPARE(Py_UCS4, Py_UCS1);
10663 break;
10664 case PyUnicode_2BYTE_KIND:
10665 COMPARE(Py_UCS4, Py_UCS2);
10666 break;
10667 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010668 {
10669#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10670 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10671 /* normalize result of wmemcmp() into the range [-1; 1] */
10672 if (cmp < 0)
10673 return -1;
10674 if (cmp > 0)
10675 return 1;
10676#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010677 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010678#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010679 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010680 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010681 default:
10682 assert(0);
10683 }
10684 break;
10685 }
10686 default:
10687 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010688 }
10689
Victor Stinner770e19e2012-10-04 22:59:45 +020010690 if (len1 == len2)
10691 return 0;
10692 if (len1 < len2)
10693 return -1;
10694 else
10695 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010696
10697#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010698}
10699
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010700Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010701unicode_compare_eq(PyObject *str1, PyObject *str2)
10702{
10703 int kind;
10704 void *data1, *data2;
10705 Py_ssize_t len;
10706 int cmp;
10707
Victor Stinnere5567ad2012-10-23 02:48:49 +020010708 len = PyUnicode_GET_LENGTH(str1);
10709 if (PyUnicode_GET_LENGTH(str2) != len)
10710 return 0;
10711 kind = PyUnicode_KIND(str1);
10712 if (PyUnicode_KIND(str2) != kind)
10713 return 0;
10714 data1 = PyUnicode_DATA(str1);
10715 data2 = PyUnicode_DATA(str2);
10716
10717 cmp = memcmp(data1, data2, len * kind);
10718 return (cmp == 0);
10719}
10720
10721
Alexander Belopolsky40018472011-02-26 01:02:56 +000010722int
10723PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10726 if (PyUnicode_READY(left) == -1 ||
10727 PyUnicode_READY(right) == -1)
10728 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010729
10730 /* a string is equal to itself */
10731 if (left == right)
10732 return 0;
10733
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010734 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010736 PyErr_Format(PyExc_TypeError,
10737 "Can't compare %.100s and %.100s",
10738 left->ob_type->tp_name,
10739 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 return -1;
10741}
10742
Martin v. Löwis5b222132007-06-10 09:51:05 +000010743int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010744_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10745{
10746 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10747 if (right_str == NULL)
10748 return -1;
10749 return PyUnicode_Compare(left, right_str);
10750}
10751
10752int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010753PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 Py_ssize_t i;
10756 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 Py_UCS4 chr;
10758
Victor Stinner910337b2011-10-03 03:20:16 +020010759 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 if (PyUnicode_READY(uni) == -1)
10761 return -1;
10762 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010763 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010764 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010765 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010766 size_t len, len2 = strlen(str);
10767 int cmp;
10768
10769 len = Py_MIN(len1, len2);
10770 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010771 if (cmp != 0) {
10772 if (cmp < 0)
10773 return -1;
10774 else
10775 return 1;
10776 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010777 if (len1 > len2)
10778 return 1; /* uni is longer */
10779 if (len2 > len1)
10780 return -1; /* str is longer */
10781 return 0;
10782 }
10783 else {
10784 void *data = PyUnicode_DATA(uni);
10785 /* Compare Unicode string and source character set string */
10786 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10787 if (chr != str[i])
10788 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10789 /* This check keeps Python strings that end in '\0' from comparing equal
10790 to C strings identical up to that point. */
10791 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10792 return 1; /* uni is longer */
10793 if (str[i])
10794 return -1; /* str is longer */
10795 return 0;
10796 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010797}
10798
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010799
Benjamin Peterson29060642009-01-31 22:14:21 +000010800#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010801 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010802
Alexander Belopolsky40018472011-02-26 01:02:56 +000010803PyObject *
10804PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010805{
10806 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010807 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010808
Victor Stinnere5567ad2012-10-23 02:48:49 +020010809 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10810 Py_RETURN_NOTIMPLEMENTED;
10811
10812 if (PyUnicode_READY(left) == -1 ||
10813 PyUnicode_READY(right) == -1)
10814 return NULL;
10815
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010816 if (left == right) {
10817 switch (op) {
10818 case Py_EQ:
10819 case Py_LE:
10820 case Py_GE:
10821 /* a string is equal to itself */
10822 v = Py_True;
10823 break;
10824 case Py_NE:
10825 case Py_LT:
10826 case Py_GT:
10827 v = Py_False;
10828 break;
10829 default:
10830 PyErr_BadArgument();
10831 return NULL;
10832 }
10833 }
10834 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010835 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010836 result ^= (op == Py_NE);
10837 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010838 }
10839 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010840 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010841
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010842 /* Convert the return value to a Boolean */
10843 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010844 case Py_LE:
10845 v = TEST_COND(result <= 0);
10846 break;
10847 case Py_GE:
10848 v = TEST_COND(result >= 0);
10849 break;
10850 case Py_LT:
10851 v = TEST_COND(result == -1);
10852 break;
10853 case Py_GT:
10854 v = TEST_COND(result == 1);
10855 break;
10856 default:
10857 PyErr_BadArgument();
10858 return NULL;
10859 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010860 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010861 Py_INCREF(v);
10862 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010863}
10864
Alexander Belopolsky40018472011-02-26 01:02:56 +000010865int
10866PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010867{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010868 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010869 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 void *buf1, *buf2;
10871 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010872 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010873
10874 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010875 sub = PyUnicode_FromObject(element);
10876 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010877 PyErr_Format(PyExc_TypeError,
10878 "'in <string>' requires string as left operand, not %s",
10879 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010880 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010881 }
10882
Thomas Wouters477c8d52006-05-27 19:21:47 +000010883 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010884 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010885 Py_DECREF(sub);
10886 return -1;
10887 }
10888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 kind1 = PyUnicode_KIND(str);
10890 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 buf1 = PyUnicode_DATA(str);
10892 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010893 if (kind2 != kind1) {
10894 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010895 Py_DECREF(sub);
10896 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010897 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010898 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010899 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 if (!buf2) {
10902 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010903 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 return -1;
10905 }
10906 len1 = PyUnicode_GET_LENGTH(str);
10907 len2 = PyUnicode_GET_LENGTH(sub);
10908
Victor Stinner77282cb2013-04-14 19:22:47 +020010909 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 case PyUnicode_1BYTE_KIND:
10911 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10912 break;
10913 case PyUnicode_2BYTE_KIND:
10914 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10915 break;
10916 case PyUnicode_4BYTE_KIND:
10917 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10918 break;
10919 default:
10920 result = -1;
10921 assert(0);
10922 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010923
10924 Py_DECREF(str);
10925 Py_DECREF(sub);
10926
Victor Stinner77282cb2013-04-14 19:22:47 +020010927 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 PyMem_Free(buf2);
10929
Guido van Rossum403d68b2000-03-13 15:55:09 +000010930 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010931}
10932
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933/* Concat to string or Unicode object giving a new Unicode object. */
10934
Alexander Belopolsky40018472011-02-26 01:02:56 +000010935PyObject *
10936PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010939 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010940 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941
10942 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010945 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010948 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949
10950 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010951 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010952 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010955 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010956 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 }
10959
Victor Stinner488fa492011-12-12 00:01:39 +010010960 u_len = PyUnicode_GET_LENGTH(u);
10961 v_len = PyUnicode_GET_LENGTH(v);
10962 if (u_len > PY_SSIZE_T_MAX - v_len) {
10963 PyErr_SetString(PyExc_OverflowError,
10964 "strings are too large to concat");
10965 goto onError;
10966 }
10967 new_len = u_len + v_len;
10968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010970 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010971 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010974 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010977 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10978 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979 Py_DECREF(u);
10980 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010981 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
Benjamin Peterson29060642009-01-31 22:14:21 +000010984 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985 Py_XDECREF(u);
10986 Py_XDECREF(v);
10987 return NULL;
10988}
10989
Walter Dörwald1ab83302007-05-18 17:15:44 +000010990void
Victor Stinner23e56682011-10-03 03:54:37 +020010991PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010992{
Victor Stinner23e56682011-10-03 03:54:37 +020010993 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010994 Py_UCS4 maxchar, maxchar2;
10995 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010996
10997 if (p_left == NULL) {
10998 if (!PyErr_Occurred())
10999 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011000 return;
11001 }
Victor Stinner23e56682011-10-03 03:54:37 +020011002 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011003 if (right == NULL || left == NULL
11004 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011005 if (!PyErr_Occurred())
11006 PyErr_BadInternalCall();
11007 goto error;
11008 }
11009
Benjamin Petersonbac79492012-01-14 13:34:47 -050011010 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011011 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011012 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011013 goto error;
11014
Victor Stinner488fa492011-12-12 00:01:39 +010011015 /* Shortcuts */
11016 if (left == unicode_empty) {
11017 Py_DECREF(left);
11018 Py_INCREF(right);
11019 *p_left = right;
11020 return;
11021 }
11022 if (right == unicode_empty)
11023 return;
11024
11025 left_len = PyUnicode_GET_LENGTH(left);
11026 right_len = PyUnicode_GET_LENGTH(right);
11027 if (left_len > PY_SSIZE_T_MAX - right_len) {
11028 PyErr_SetString(PyExc_OverflowError,
11029 "strings are too large to concat");
11030 goto error;
11031 }
11032 new_len = left_len + right_len;
11033
11034 if (unicode_modifiable(left)
11035 && PyUnicode_CheckExact(right)
11036 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011037 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11038 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011039 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011040 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011041 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11042 {
11043 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011044 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011045 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011046
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011047 /* copy 'right' into the newly allocated area of 'left' */
11048 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011049 }
Victor Stinner488fa492011-12-12 00:01:39 +010011050 else {
11051 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11052 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011053 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011054
Victor Stinner488fa492011-12-12 00:01:39 +010011055 /* Concat the two Unicode strings */
11056 res = PyUnicode_New(new_len, maxchar);
11057 if (res == NULL)
11058 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011059 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11060 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011061 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011062 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011063 }
11064 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011065 return;
11066
11067error:
Victor Stinner488fa492011-12-12 00:01:39 +010011068 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011069}
11070
11071void
11072PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11073{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011074 PyUnicode_Append(pleft, right);
11075 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011076}
11077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011078PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011079 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011081Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011082string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011083interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084
11085static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011086unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011088 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011089 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011090 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 int kind1, kind2, kind;
11093 void *buf1, *buf2;
11094 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095
Jesus Ceaac451502011-04-20 17:09:23 +020011096 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11097 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011098 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 kind1 = PyUnicode_KIND(self);
11101 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011102 if (kind2 > kind1) {
11103 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011104 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011105 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011106 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 buf1 = PyUnicode_DATA(self);
11108 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011110 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 if (!buf2) {
11112 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 return NULL;
11114 }
11115 len1 = PyUnicode_GET_LENGTH(self);
11116 len2 = PyUnicode_GET_LENGTH(substring);
11117
11118 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011119 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 case PyUnicode_1BYTE_KIND:
11121 iresult = ucs1lib_count(
11122 ((Py_UCS1*)buf1) + start, end - start,
11123 buf2, len2, PY_SSIZE_T_MAX
11124 );
11125 break;
11126 case PyUnicode_2BYTE_KIND:
11127 iresult = ucs2lib_count(
11128 ((Py_UCS2*)buf1) + start, end - start,
11129 buf2, len2, PY_SSIZE_T_MAX
11130 );
11131 break;
11132 case PyUnicode_4BYTE_KIND:
11133 iresult = ucs4lib_count(
11134 ((Py_UCS4*)buf1) + start, end - start,
11135 buf2, len2, PY_SSIZE_T_MAX
11136 );
11137 break;
11138 default:
11139 assert(0); iresult = 0;
11140 }
11141
11142 result = PyLong_FromSsize_t(iresult);
11143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 if (kind2 != kind)
11145 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146
11147 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011148
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 return result;
11150}
11151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011152PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011153 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011155Encode S using the codec registered for encoding. Default encoding\n\
11156is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011157handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011158a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11159'xmlcharrefreplace' as well as any other name registered with\n\
11160codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
11162static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011163unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011165 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166 char *encoding = NULL;
11167 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011168
Benjamin Peterson308d6372009-09-18 21:42:35 +000011169 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11170 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011172 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011173}
11174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011175PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011176 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177\n\
11178Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011179If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
11181static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011182unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011184 Py_ssize_t i, j, line_pos, src_len, incr;
11185 Py_UCS4 ch;
11186 PyObject *u;
11187 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011188 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011190 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011191 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
Ezio Melotti745d54d2013-11-16 19:10:57 +020011193 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11194 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011195 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196
Antoine Pitrou22425222011-10-04 19:10:51 +020011197 if (PyUnicode_READY(self) == -1)
11198 return NULL;
11199
Thomas Wouters7e474022000-07-16 12:04:32 +000011200 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011201 src_len = PyUnicode_GET_LENGTH(self);
11202 i = j = line_pos = 0;
11203 kind = PyUnicode_KIND(self);
11204 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011205 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011206 for (; i < src_len; i++) {
11207 ch = PyUnicode_READ(kind, src_data, i);
11208 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011209 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011210 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011211 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011213 goto overflow;
11214 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011216 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011219 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011220 goto overflow;
11221 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011223 if (ch == '\n' || ch == '\r')
11224 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011226 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011227 if (!found)
11228 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011229
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011231 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 if (!u)
11233 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011234 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
Antoine Pitroue71d5742011-10-04 15:55:09 +020011236 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
Antoine Pitroue71d5742011-10-04 15:55:09 +020011238 for (; i < src_len; i++) {
11239 ch = PyUnicode_READ(kind, src_data, i);
11240 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011242 incr = tabsize - (line_pos % tabsize);
11243 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011244 FILL(kind, dest_data, ' ', j, incr);
11245 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011247 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011249 line_pos++;
11250 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011251 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011252 if (ch == '\n' || ch == '\r')
11253 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011255 }
11256 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011257 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011258
Antoine Pitroue71d5742011-10-04 15:55:09 +020011259 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011260 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262}
11263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011264PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266\n\
11267Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011268such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269arguments start and end are interpreted as in slice notation.\n\
11270\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011276 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011277 Py_ssize_t start;
11278 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011279 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Jesus Ceaac451502011-04-20 17:09:23 +020011281 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11282 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Christian Heimesd47802e2013-06-29 21:33:36 +020011285 if (PyUnicode_READY(self) == -1) {
11286 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011288 }
11289 if (PyUnicode_READY(substring) == -1) {
11290 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293
Victor Stinner7931d9a2011-11-04 00:22:48 +010011294 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
11296 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 if (result == -2)
11299 return NULL;
11300
Christian Heimes217cfd12007-12-02 14:31:20 +000011301 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302}
11303
11304static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011305unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011307 void *data;
11308 enum PyUnicode_Kind kind;
11309 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011310
11311 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11312 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011314 }
11315 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11316 PyErr_SetString(PyExc_IndexError, "string index out of range");
11317 return NULL;
11318 }
11319 kind = PyUnicode_KIND(self);
11320 data = PyUnicode_DATA(self);
11321 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011322 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323}
11324
Guido van Rossumc2504932007-09-18 19:42:40 +000011325/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011326 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011327static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011328unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329{
Guido van Rossumc2504932007-09-18 19:42:40 +000011330 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011331 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011332
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011333#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011334 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011335#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 if (_PyUnicode_HASH(self) != -1)
11337 return _PyUnicode_HASH(self);
11338 if (PyUnicode_READY(self) == -1)
11339 return -1;
11340 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011341 /*
11342 We make the hash of the empty string be 0, rather than using
11343 (prefix ^ suffix), since this slightly obfuscates the hash secret
11344 */
11345 if (len == 0) {
11346 _PyUnicode_HASH(self) = 0;
11347 return 0;
11348 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011349 x = _Py_HashBytes(PyUnicode_DATA(self),
11350 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011352 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353}
11354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011355PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011358Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359
11360static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011363 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011364 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011365 Py_ssize_t start;
11366 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
Jesus Ceaac451502011-04-20 17:09:23 +020011368 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11369 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
Christian Heimesd47a0452013-06-29 21:21:37 +020011372 if (PyUnicode_READY(self) == -1) {
11373 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011375 }
11376 if (PyUnicode_READY(substring) == -1) {
11377 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380
Victor Stinner7931d9a2011-11-04 00:22:48 +010011381 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
11383 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 if (result == -2)
11386 return NULL;
11387
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388 if (result < 0) {
11389 PyErr_SetString(PyExc_ValueError, "substring not found");
11390 return NULL;
11391 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011392
Christian Heimes217cfd12007-12-02 14:31:20 +000011393 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394}
11395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011396PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011397 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011399Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011400at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401
11402static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011403unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 Py_ssize_t i, length;
11406 int kind;
11407 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408 int cased;
11409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (PyUnicode_READY(self) == -1)
11411 return NULL;
11412 length = PyUnicode_GET_LENGTH(self);
11413 kind = PyUnicode_KIND(self);
11414 data = PyUnicode_DATA(self);
11415
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 if (length == 1)
11418 return PyBool_FromLong(
11419 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011421 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011424
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 for (i = 0; i < length; i++) {
11427 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011428
Benjamin Peterson29060642009-01-31 22:14:21 +000011429 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11430 return PyBool_FromLong(0);
11431 else if (!cased && Py_UNICODE_ISLOWER(ch))
11432 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011434 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435}
11436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011440Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011441at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
11443static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011444unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 Py_ssize_t i, length;
11447 int kind;
11448 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 int cased;
11450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 if (PyUnicode_READY(self) == -1)
11452 return NULL;
11453 length = PyUnicode_GET_LENGTH(self);
11454 kind = PyUnicode_KIND(self);
11455 data = PyUnicode_DATA(self);
11456
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (length == 1)
11459 return PyBool_FromLong(
11460 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011462 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011465
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 for (i = 0; i < length; i++) {
11468 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011469
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11471 return PyBool_FromLong(0);
11472 else if (!cased && Py_UNICODE_ISUPPER(ch))
11473 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011475 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476}
11477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011478PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011481Return True if S is a titlecased string and there is at least one\n\
11482character in S, i.e. upper- and titlecase characters may only\n\
11483follow uncased characters and lowercase characters only cased ones.\n\
11484Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485
11486static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011487unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 Py_ssize_t i, length;
11490 int kind;
11491 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 int cased, previous_is_cased;
11493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 if (PyUnicode_READY(self) == -1)
11495 return NULL;
11496 length = PyUnicode_GET_LENGTH(self);
11497 kind = PyUnicode_KIND(self);
11498 data = PyUnicode_DATA(self);
11499
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 if (length == 1) {
11502 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11503 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11504 (Py_UNICODE_ISUPPER(ch) != 0));
11505 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011507 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011510
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 cased = 0;
11512 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 for (i = 0; i < length; i++) {
11514 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011515
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11517 if (previous_is_cased)
11518 return PyBool_FromLong(0);
11519 previous_is_cased = 1;
11520 cased = 1;
11521 }
11522 else if (Py_UNICODE_ISLOWER(ch)) {
11523 if (!previous_is_cased)
11524 return PyBool_FromLong(0);
11525 previous_is_cased = 1;
11526 cased = 1;
11527 }
11528 else
11529 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011531 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532}
11533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011534PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011535 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011537Return True if all characters in S are whitespace\n\
11538and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
11540static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011541unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 Py_ssize_t i, length;
11544 int kind;
11545 void *data;
11546
11547 if (PyUnicode_READY(self) == -1)
11548 return NULL;
11549 length = PyUnicode_GET_LENGTH(self);
11550 kind = PyUnicode_KIND(self);
11551 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (length == 1)
11555 return PyBool_FromLong(
11556 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011558 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 for (i = 0; i < length; i++) {
11563 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011564 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011567 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568}
11569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011570PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011572\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011573Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011574and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011575
11576static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011577unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011578{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 Py_ssize_t i, length;
11580 int kind;
11581 void *data;
11582
11583 if (PyUnicode_READY(self) == -1)
11584 return NULL;
11585 length = PyUnicode_GET_LENGTH(self);
11586 kind = PyUnicode_KIND(self);
11587 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011588
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011589 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 if (length == 1)
11591 return PyBool_FromLong(
11592 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011593
11594 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 for (i = 0; i < length; i++) {
11599 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011601 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011602 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011603}
11604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011605PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011607\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011608Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011610
11611static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011612unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 int kind;
11615 void *data;
11616 Py_ssize_t len, i;
11617
11618 if (PyUnicode_READY(self) == -1)
11619 return NULL;
11620
11621 kind = PyUnicode_KIND(self);
11622 data = PyUnicode_DATA(self);
11623 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011624
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011625 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 if (len == 1) {
11627 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11628 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11629 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011630
11631 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 for (i = 0; i < len; i++) {
11636 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011637 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011639 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011640 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011641}
11642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011643PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011644 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011646Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011647False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648
11649static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011650unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652 Py_ssize_t i, length;
11653 int kind;
11654 void *data;
11655
11656 if (PyUnicode_READY(self) == -1)
11657 return NULL;
11658 length = PyUnicode_GET_LENGTH(self);
11659 kind = PyUnicode_KIND(self);
11660 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 if (length == 1)
11664 return PyBool_FromLong(
11665 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011667 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011669 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 for (i = 0; i < length; i++) {
11672 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011673 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011675 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676}
11677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011678PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011681Return True if all characters in S are digits\n\
11682and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683
11684static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011685unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 Py_ssize_t i, length;
11688 int kind;
11689 void *data;
11690
11691 if (PyUnicode_READY(self) == -1)
11692 return NULL;
11693 length = PyUnicode_GET_LENGTH(self);
11694 kind = PyUnicode_KIND(self);
11695 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 if (length == 1) {
11699 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11700 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011703 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011705 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 for (i = 0; i < length; i++) {
11708 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011711 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712}
11713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011714PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011717Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011718False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719
11720static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011721unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 Py_ssize_t i, length;
11724 int kind;
11725 void *data;
11726
11727 if (PyUnicode_READY(self) == -1)
11728 return NULL;
11729 length = PyUnicode_GET_LENGTH(self);
11730 kind = PyUnicode_KIND(self);
11731 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 if (length == 1)
11735 return PyBool_FromLong(
11736 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011738 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 for (i = 0; i < length; i++) {
11743 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011746 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747}
11748
Martin v. Löwis47383402007-08-15 07:32:56 +000011749int
11750PyUnicode_IsIdentifier(PyObject *self)
11751{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 int kind;
11753 void *data;
11754 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011755 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 if (PyUnicode_READY(self) == -1) {
11758 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 }
11761
11762 /* Special case for empty strings */
11763 if (PyUnicode_GET_LENGTH(self) == 0)
11764 return 0;
11765 kind = PyUnicode_KIND(self);
11766 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011767
11768 /* PEP 3131 says that the first character must be in
11769 XID_Start and subsequent characters in XID_Continue,
11770 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011771 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011772 letters, digits, underscore). However, given the current
11773 definition of XID_Start and XID_Continue, it is sufficient
11774 to check just for these, except that _ must be allowed
11775 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011777 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011778 return 0;
11779
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011780 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011783 return 1;
11784}
11785
11786PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011788\n\
11789Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011790to the language definition.\n\
11791\n\
11792Use keyword.iskeyword() to test for reserved identifiers\n\
11793such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011794
11795static PyObject*
11796unicode_isidentifier(PyObject *self)
11797{
11798 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11799}
11800
Georg Brandl559e5d72008-06-11 18:37:52 +000011801PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011803\n\
11804Return True if all characters in S are considered\n\
11805printable in repr() or S is empty, False otherwise.");
11806
11807static PyObject*
11808unicode_isprintable(PyObject *self)
11809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 Py_ssize_t i, length;
11811 int kind;
11812 void *data;
11813
11814 if (PyUnicode_READY(self) == -1)
11815 return NULL;
11816 length = PyUnicode_GET_LENGTH(self);
11817 kind = PyUnicode_KIND(self);
11818 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011819
11820 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (length == 1)
11822 return PyBool_FromLong(
11823 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 for (i = 0; i < length; i++) {
11826 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011827 Py_RETURN_FALSE;
11828 }
11829 }
11830 Py_RETURN_TRUE;
11831}
11832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011833PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011834 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835\n\
11836Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011837iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
11839static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011840unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011842 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843}
11844
Martin v. Löwis18e16552006-02-15 17:27:45 +000011845static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011846unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 if (PyUnicode_READY(self) == -1)
11849 return -1;
11850 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851}
11852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011853PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011856Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011857done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
11859static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011860unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011862 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 Py_UCS4 fillchar = ' ';
11864
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011865 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 return NULL;
11867
Benjamin Petersonbac79492012-01-14 13:34:47 -050011868 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011869 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870
Victor Stinnerc4b49542011-12-11 22:44:26 +010011871 if (PyUnicode_GET_LENGTH(self) >= width)
11872 return unicode_result_unchanged(self);
11873
11874 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875}
11876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011877PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011880Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881
11882static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011883unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011885 if (PyUnicode_READY(self) == -1)
11886 return NULL;
11887 if (PyUnicode_IS_ASCII(self))
11888 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011889 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890}
11891
/* Strip modes; each value is also an index into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string past its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
11900
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011901/* externally visible for str.strip(unicode) */
11902PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011903_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 void *data;
11906 int kind;
11907 Py_ssize_t i, j, len;
11908 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011909 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11912 return NULL;
11913
11914 kind = PyUnicode_KIND(self);
11915 data = PyUnicode_DATA(self);
11916 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011917 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11919 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011920 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011921
Benjamin Peterson14339b62009-01-31 16:36:08 +000011922 i = 0;
11923 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011924 while (i < len) {
11925 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11926 if (!BLOOM(sepmask, ch))
11927 break;
11928 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11929 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 i++;
11931 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011932 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011933
Benjamin Peterson14339b62009-01-31 16:36:08 +000011934 j = len;
11935 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011936 j--;
11937 while (j >= i) {
11938 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11939 if (!BLOOM(sepmask, ch))
11940 break;
11941 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11942 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011944 }
11945
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011947 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011948
Victor Stinner7931d9a2011-11-04 00:22:48 +010011949 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950}
11951
11952PyObject*
11953PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11954{
11955 unsigned char *data;
11956 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011957 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958
Victor Stinnerde636f32011-10-01 03:55:54 +020011959 if (PyUnicode_READY(self) == -1)
11960 return NULL;
11961
Victor Stinner684d5fd2012-05-03 02:32:34 +020011962 length = PyUnicode_GET_LENGTH(self);
11963 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011964
Victor Stinner684d5fd2012-05-03 02:32:34 +020011965 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011966 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967
Victor Stinnerde636f32011-10-01 03:55:54 +020011968 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011969 PyErr_SetString(PyExc_IndexError, "string index out of range");
11970 return NULL;
11971 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011972 if (start >= length || end < start)
11973 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011974
Victor Stinner684d5fd2012-05-03 02:32:34 +020011975 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011976 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011977 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011978 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011979 }
11980 else {
11981 kind = PyUnicode_KIND(self);
11982 data = PyUnicode_1BYTE_DATA(self);
11983 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011984 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011985 length);
11986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988
11989static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011990do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 Py_ssize_t len, i, j;
11993
11994 if (PyUnicode_READY(self) == -1)
11995 return NULL;
11996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011998
Victor Stinnercc7af722013-04-09 22:39:24 +020011999 if (PyUnicode_IS_ASCII(self)) {
12000 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12001
12002 i = 0;
12003 if (striptype != RIGHTSTRIP) {
12004 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012005 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012006 if (!_Py_ascii_whitespace[ch])
12007 break;
12008 i++;
12009 }
12010 }
12011
12012 j = len;
12013 if (striptype != LEFTSTRIP) {
12014 j--;
12015 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012016 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012017 if (!_Py_ascii_whitespace[ch])
12018 break;
12019 j--;
12020 }
12021 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012022 }
12023 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012024 else {
12025 int kind = PyUnicode_KIND(self);
12026 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012027
Victor Stinnercc7af722013-04-09 22:39:24 +020012028 i = 0;
12029 if (striptype != RIGHTSTRIP) {
12030 while (i < len) {
12031 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12032 if (!Py_UNICODE_ISSPACE(ch))
12033 break;
12034 i++;
12035 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012036 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012037
12038 j = len;
12039 if (striptype != LEFTSTRIP) {
12040 j--;
12041 while (j >= i) {
12042 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12043 if (!Py_UNICODE_ISSPACE(ch))
12044 break;
12045 j--;
12046 }
12047 j++;
12048 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012049 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012050
Victor Stinner7931d9a2011-11-04 00:22:48 +010012051 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052}
12053
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012054
12055static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012056do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012057{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012058 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012059
Serhiy Storchakac6792272013-10-19 21:03:34 +030012060 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012061 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012062
Benjamin Peterson14339b62009-01-31 16:36:08 +000012063 if (sep != NULL && sep != Py_None) {
12064 if (PyUnicode_Check(sep))
12065 return _PyUnicode_XStrip(self, striptype, sep);
12066 else {
12067 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 "%s arg must be None or str",
12069 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012070 return NULL;
12071 }
12072 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012073
Benjamin Peterson14339b62009-01-31 16:36:08 +000012074 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012075}
12076
12077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012078PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012080\n\
12081Return a copy of the string S with leading and trailing\n\
12082whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012083If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012084
12085static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012086unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012087{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012088 if (PyTuple_GET_SIZE(args) == 0)
12089 return do_strip(self, BOTHSTRIP); /* Common case */
12090 else
12091 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012092}
12093
12094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012095PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012097\n\
12098Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012099If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012100
12101static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012102unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012103{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012104 if (PyTuple_GET_SIZE(args) == 0)
12105 return do_strip(self, LEFTSTRIP); /* Common case */
12106 else
12107 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012108}
12109
12110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012111PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012113\n\
12114Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012115If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116
12117static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012118unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012120 if (PyTuple_GET_SIZE(args) == 0)
12121 return do_strip(self, RIGHTSTRIP); /* Common case */
12122 else
12123 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012124}
12125
12126
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012128unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012130 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132
Serhiy Storchaka05997252013-01-26 12:14:02 +020012133 if (len < 1)
12134 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135
Victor Stinnerc4b49542011-12-11 22:44:26 +010012136 /* no repeat, return original string */
12137 if (len == 1)
12138 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012139
Benjamin Petersonbac79492012-01-14 13:34:47 -050012140 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 return NULL;
12142
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012143 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012144 PyErr_SetString(PyExc_OverflowError,
12145 "repeated string is too long");
12146 return NULL;
12147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012149
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012150 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151 if (!u)
12152 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012153 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (PyUnicode_GET_LENGTH(str) == 1) {
12156 const int kind = PyUnicode_KIND(str);
12157 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012158 if (kind == PyUnicode_1BYTE_KIND) {
12159 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012160 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012161 }
12162 else if (kind == PyUnicode_2BYTE_KIND) {
12163 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012164 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012165 ucs2[n] = fill_char;
12166 } else {
12167 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12168 assert(kind == PyUnicode_4BYTE_KIND);
12169 for (n = 0; n < len; ++n)
12170 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 }
12173 else {
12174 /* number of characters copied this far */
12175 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012176 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 char *to = (char *) PyUnicode_DATA(u);
12178 Py_MEMCPY(to, PyUnicode_DATA(str),
12179 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 n = (done <= nchars-done) ? done : nchars-done;
12182 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012183 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185 }
12186
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012187 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012188 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189}
12190
Alexander Belopolsky40018472011-02-26 01:02:56 +000012191PyObject *
12192PyUnicode_Replace(PyObject *obj,
12193 PyObject *subobj,
12194 PyObject *replobj,
12195 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196{
12197 PyObject *self;
12198 PyObject *str1;
12199 PyObject *str2;
12200 PyObject *result;
12201
12202 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012203 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012204 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012206 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012207 Py_DECREF(self);
12208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209 }
12210 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012211 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012212 Py_DECREF(self);
12213 Py_DECREF(str1);
12214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012216 if (PyUnicode_READY(self) == -1 ||
12217 PyUnicode_READY(str1) == -1 ||
12218 PyUnicode_READY(str2) == -1)
12219 result = NULL;
12220 else
12221 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222 Py_DECREF(self);
12223 Py_DECREF(str1);
12224 Py_DECREF(str2);
12225 return result;
12226}
12227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012228PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012229 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230\n\
12231Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012232old replaced by new. If the optional argument count is\n\
12233given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234
12235static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 PyObject *str1;
12239 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012240 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241 PyObject *result;
12242
Martin v. Löwis18e16552006-02-15 17:27:45 +000012243 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012245 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012248 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 return NULL;
12250 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012251 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 Py_DECREF(str1);
12253 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012254 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012255 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12256 result = NULL;
12257 else
12258 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259
12260 Py_DECREF(str1);
12261 Py_DECREF(str2);
12262 return result;
12263}
12264
Alexander Belopolsky40018472011-02-26 01:02:56 +000012265static PyObject *
12266unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012268 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 Py_ssize_t isize;
12270 Py_ssize_t osize, squote, dquote, i, o;
12271 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012272 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012276 return NULL;
12277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 isize = PyUnicode_GET_LENGTH(unicode);
12279 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 /* Compute length of output, quote characters, and
12282 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012283 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 max = 127;
12285 squote = dquote = 0;
12286 ikind = PyUnicode_KIND(unicode);
12287 for (i = 0; i < isize; i++) {
12288 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12289 switch (ch) {
12290 case '\'': squote++; osize++; break;
12291 case '"': dquote++; osize++; break;
12292 case '\\': case '\t': case '\r': case '\n':
12293 osize += 2; break;
12294 default:
12295 /* Fast-path ASCII */
12296 if (ch < ' ' || ch == 0x7f)
12297 osize += 4; /* \xHH */
12298 else if (ch < 0x7f)
12299 osize++;
12300 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12301 osize++;
12302 max = ch > max ? ch : max;
12303 }
12304 else if (ch < 0x100)
12305 osize += 4; /* \xHH */
12306 else if (ch < 0x10000)
12307 osize += 6; /* \uHHHH */
12308 else
12309 osize += 10; /* \uHHHHHHHH */
12310 }
12311 }
12312
12313 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012314 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012316 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 if (dquote)
12318 /* Both squote and dquote present. Use squote,
12319 and escape them */
12320 osize += squote;
12321 else
12322 quote = '"';
12323 }
Victor Stinner55c08782013-04-14 18:45:39 +020012324 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325
12326 repr = PyUnicode_New(osize, max);
12327 if (repr == NULL)
12328 return NULL;
12329 okind = PyUnicode_KIND(repr);
12330 odata = PyUnicode_DATA(repr);
12331
12332 PyUnicode_WRITE(okind, odata, 0, quote);
12333 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012334 if (unchanged) {
12335 _PyUnicode_FastCopyCharacters(repr, 1,
12336 unicode, 0,
12337 isize);
12338 }
12339 else {
12340 for (i = 0, o = 1; i < isize; i++) {
12341 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342
Victor Stinner55c08782013-04-14 18:45:39 +020012343 /* Escape quotes and backslashes */
12344 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012345 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012347 continue;
12348 }
12349
12350 /* Map special whitespace to '\t', \n', '\r' */
12351 if (ch == '\t') {
12352 PyUnicode_WRITE(okind, odata, o++, '\\');
12353 PyUnicode_WRITE(okind, odata, o++, 't');
12354 }
12355 else if (ch == '\n') {
12356 PyUnicode_WRITE(okind, odata, o++, '\\');
12357 PyUnicode_WRITE(okind, odata, o++, 'n');
12358 }
12359 else if (ch == '\r') {
12360 PyUnicode_WRITE(okind, odata, o++, '\\');
12361 PyUnicode_WRITE(okind, odata, o++, 'r');
12362 }
12363
12364 /* Map non-printable US ASCII to '\xhh' */
12365 else if (ch < ' ' || ch == 0x7F) {
12366 PyUnicode_WRITE(okind, odata, o++, '\\');
12367 PyUnicode_WRITE(okind, odata, o++, 'x');
12368 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12369 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12370 }
12371
12372 /* Copy ASCII characters as-is */
12373 else if (ch < 0x7F) {
12374 PyUnicode_WRITE(okind, odata, o++, ch);
12375 }
12376
12377 /* Non-ASCII characters */
12378 else {
12379 /* Map Unicode whitespace and control characters
12380 (categories Z* and C* except ASCII space)
12381 */
12382 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12383 PyUnicode_WRITE(okind, odata, o++, '\\');
12384 /* Map 8-bit characters to '\xhh' */
12385 if (ch <= 0xff) {
12386 PyUnicode_WRITE(okind, odata, o++, 'x');
12387 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12388 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12389 }
12390 /* Map 16-bit characters to '\uxxxx' */
12391 else if (ch <= 0xffff) {
12392 PyUnicode_WRITE(okind, odata, o++, 'u');
12393 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12394 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12395 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12396 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12397 }
12398 /* Map 21-bit characters to '\U00xxxxxx' */
12399 else {
12400 PyUnicode_WRITE(okind, odata, o++, 'U');
12401 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12402 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12403 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12404 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12405 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12406 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12407 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12408 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12409 }
12410 }
12411 /* Copy characters as-is */
12412 else {
12413 PyUnicode_WRITE(okind, odata, o++, ch);
12414 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012415 }
12416 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012419 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012420 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421}
12422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012423PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012424 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425\n\
12426Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012427such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428arguments start and end are interpreted as in slice notation.\n\
12429\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012430Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431
12432static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012435 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012436 Py_ssize_t start;
12437 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012439
Jesus Ceaac451502011-04-20 17:09:23 +020012440 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12441 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443
Christian Heimesea71a522013-06-29 21:17:34 +020012444 if (PyUnicode_READY(self) == -1) {
12445 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012447 }
12448 if (PyUnicode_READY(substring) == -1) {
12449 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452
Victor Stinner7931d9a2011-11-04 00:22:48 +010012453 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454
12455 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 if (result == -2)
12458 return NULL;
12459
Christian Heimes217cfd12007-12-02 14:31:20 +000012460 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461}
12462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012463PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012464 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012466Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467
12468static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012471 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012472 Py_ssize_t start;
12473 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012474 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475
Jesus Ceaac451502011-04-20 17:09:23 +020012476 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12477 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479
Christian Heimesea71a522013-06-29 21:17:34 +020012480 if (PyUnicode_READY(self) == -1) {
12481 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012483 }
12484 if (PyUnicode_READY(substring) == -1) {
12485 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488
Victor Stinner7931d9a2011-11-04 00:22:48 +010012489 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
12491 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 if (result == -2)
12494 return NULL;
12495
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496 if (result < 0) {
12497 PyErr_SetString(PyExc_ValueError, "substring not found");
12498 return NULL;
12499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500
Christian Heimes217cfd12007-12-02 14:31:20 +000012501 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502}
12503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012504PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012507Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012508done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
12510static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012511unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012513 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 Py_UCS4 fillchar = ' ';
12515
Victor Stinnere9a29352011-10-01 02:14:59 +020012516 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012518
Benjamin Petersonbac79492012-01-14 13:34:47 -050012519 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520 return NULL;
12521
Victor Stinnerc4b49542011-12-11 22:44:26 +010012522 if (PyUnicode_GET_LENGTH(self) >= width)
12523 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
Victor Stinnerc4b49542011-12-11 22:44:26 +010012525 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526}
12527
Alexander Belopolsky40018472011-02-26 01:02:56 +000012528PyObject *
12529PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530{
12531 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012532
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533 s = PyUnicode_FromObject(s);
12534 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012535 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 if (sep != NULL) {
12537 sep = PyUnicode_FromObject(sep);
12538 if (sep == NULL) {
12539 Py_DECREF(s);
12540 return NULL;
12541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542 }
12543
Victor Stinner9310abb2011-10-05 00:59:23 +020012544 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
12546 Py_DECREF(s);
12547 Py_XDECREF(sep);
12548 return result;
12549}
12550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012551PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012552 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553\n\
12554Return a list of the words in S, using sep as the\n\
12555delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012556splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012557whitespace string is a separator and empty strings are\n\
12558removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
12560static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012561unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012563 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012565 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012567 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12568 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569 return NULL;
12570
12571 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012574 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012576 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577}
12578
Thomas Wouters477c8d52006-05-27 19:21:47 +000012579PyObject *
12580PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12581{
12582 PyObject* str_obj;
12583 PyObject* sep_obj;
12584 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 int kind1, kind2, kind;
12586 void *buf1 = NULL, *buf2 = NULL;
12587 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012588
12589 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012590 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012592 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012593 if (!sep_obj) {
12594 Py_DECREF(str_obj);
12595 return NULL;
12596 }
12597 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12598 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012599 Py_DECREF(str_obj);
12600 return NULL;
12601 }
12602
Victor Stinner14f8f022011-10-05 20:58:25 +020012603 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012605 kind = Py_MAX(kind1, kind2);
12606 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012608 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 if (!buf1)
12610 goto onError;
12611 buf2 = PyUnicode_DATA(sep_obj);
12612 if (kind2 != kind)
12613 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12614 if (!buf2)
12615 goto onError;
12616 len1 = PyUnicode_GET_LENGTH(str_obj);
12617 len2 = PyUnicode_GET_LENGTH(sep_obj);
12618
Benjamin Petersonead6b532011-12-20 17:23:42 -060012619 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012621 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12622 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12623 else
12624 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 break;
12626 case PyUnicode_2BYTE_KIND:
12627 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12628 break;
12629 case PyUnicode_4BYTE_KIND:
12630 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12631 break;
12632 default:
12633 assert(0);
12634 out = 0;
12635 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012636
12637 Py_DECREF(sep_obj);
12638 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 if (kind1 != kind)
12640 PyMem_Free(buf1);
12641 if (kind2 != kind)
12642 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012643
12644 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 onError:
12646 Py_DECREF(sep_obj);
12647 Py_DECREF(str_obj);
12648 if (kind1 != kind && buf1)
12649 PyMem_Free(buf1);
12650 if (kind2 != kind && buf2)
12651 PyMem_Free(buf2);
12652 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012653}
12654
12655
12656PyObject *
12657PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12658{
12659 PyObject* str_obj;
12660 PyObject* sep_obj;
12661 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 int kind1, kind2, kind;
12663 void *buf1 = NULL, *buf2 = NULL;
12664 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012665
12666 str_obj = PyUnicode_FromObject(str_in);
12667 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012669 sep_obj = PyUnicode_FromObject(sep_in);
12670 if (!sep_obj) {
12671 Py_DECREF(str_obj);
12672 return NULL;
12673 }
12674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 kind1 = PyUnicode_KIND(str_in);
12676 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012677 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 buf1 = PyUnicode_DATA(str_in);
12679 if (kind1 != kind)
12680 buf1 = _PyUnicode_AsKind(str_in, kind);
12681 if (!buf1)
12682 goto onError;
12683 buf2 = PyUnicode_DATA(sep_obj);
12684 if (kind2 != kind)
12685 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12686 if (!buf2)
12687 goto onError;
12688 len1 = PyUnicode_GET_LENGTH(str_obj);
12689 len2 = PyUnicode_GET_LENGTH(sep_obj);
12690
Benjamin Petersonead6b532011-12-20 17:23:42 -060012691 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012693 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12694 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12695 else
12696 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 break;
12698 case PyUnicode_2BYTE_KIND:
12699 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12700 break;
12701 case PyUnicode_4BYTE_KIND:
12702 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12703 break;
12704 default:
12705 assert(0);
12706 out = 0;
12707 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012708
12709 Py_DECREF(sep_obj);
12710 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 if (kind1 != kind)
12712 PyMem_Free(buf1);
12713 if (kind2 != kind)
12714 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012715
12716 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 onError:
12718 Py_DECREF(sep_obj);
12719 Py_DECREF(str_obj);
12720 if (kind1 != kind && buf1)
12721 PyMem_Free(buf1);
12722 if (kind2 != kind && buf2)
12723 PyMem_Free(buf2);
12724 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012725}
12726
12727PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012729\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012730Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012731the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012732found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012733
12734static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012735unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012736{
Victor Stinner9310abb2011-10-05 00:59:23 +020012737 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012738}
12739
12740PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012741 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012742\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012743Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012744the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012745separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012746
12747static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012748unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012749{
Victor Stinner9310abb2011-10-05 00:59:23 +020012750 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012751}
12752
Alexander Belopolsky40018472011-02-26 01:02:56 +000012753PyObject *
12754PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012755{
12756 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012757
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012758 s = PyUnicode_FromObject(s);
12759 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012760 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012761 if (sep != NULL) {
12762 sep = PyUnicode_FromObject(sep);
12763 if (sep == NULL) {
12764 Py_DECREF(s);
12765 return NULL;
12766 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012767 }
12768
Victor Stinner9310abb2011-10-05 00:59:23 +020012769 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012770
12771 Py_DECREF(s);
12772 Py_XDECREF(sep);
12773 return result;
12774}
12775
12776PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012777 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012778\n\
12779Return a list of the words in S, using sep as the\n\
12780delimiter string, starting at the end of the string and\n\
12781working to the front. If maxsplit is given, at most maxsplit\n\
12782splits are done. If sep is not specified, any whitespace string\n\
12783is a separator.");
12784
12785static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012786unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012787{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012788 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012789 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012790 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012791
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012792 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12793 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012794 return NULL;
12795
12796 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012798 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012799 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012800 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012801 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012802}
12803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012804PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012805 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806\n\
12807Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012808Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012809is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810
12811static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012812unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012814 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012815 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012817 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12818 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819 return NULL;
12820
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012821 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822}
12823
12824static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012825PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012827 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828}
12829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012830PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832\n\
12833Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012834and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835
12836static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012837unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012839 if (PyUnicode_READY(self) == -1)
12840 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012841 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842}
12843
Larry Hastings61272b72014-01-07 12:41:53 -080012844/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012845
Larry Hastings31826802013-10-19 00:09:25 -070012846@staticmethod
12847str.maketrans as unicode_maketrans
12848
12849 x: object
12850
12851 y: unicode=NULL
12852
12853 z: unicode=NULL
12854
12855 /
12856
12857Return a translation table usable for str.translate().
12858
12859If there is only one argument, it must be a dictionary mapping Unicode
12860ordinals (integers) or characters to Unicode ordinals, strings or None.
12861Character keys will be then converted to ordinals.
12862If there are two arguments, they must be strings of equal length, and
12863in the resulting dictionary, each character in x will be mapped to the
12864character at the same position in y. If there is a third argument, it
12865must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012866[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012867
12868PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -080012869"maketrans(x, y=None, z=None, /)\n"
12870"--\n"
12871"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012872"Return a translation table usable for str.translate().\n"
12873"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012874"If there is only one argument, it must be a dictionary mapping Unicode\n"
12875"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12876"Character keys will be then converted to ordinals.\n"
12877"If there are two arguments, they must be strings of equal length, and\n"
12878"in the resulting dictionary, each character in x will be mapped to the\n"
12879"character at the same position in y. If there is a third argument, it\n"
12880"must be a string, whose characters will be mapped to None in the result.");
12881
12882#define UNICODE_MAKETRANS_METHODDEF \
12883 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12884
12885static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012886unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012887
12888static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012889unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012890{
Larry Hastings31826802013-10-19 00:09:25 -070012891 PyObject *return_value = NULL;
12892 PyObject *x;
12893 PyObject *y = NULL;
12894 PyObject *z = NULL;
12895
12896 if (!PyArg_ParseTuple(args,
12897 "O|UU:maketrans",
12898 &x, &y, &z))
12899 goto exit;
Larry Hastings5c661892014-01-24 06:17:25 -080012900 return_value = unicode_maketrans_impl(x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012901
12902exit:
12903 return return_value;
12904}
12905
12906static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012907unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012908/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012909{
Georg Brandlceee0772007-11-27 23:48:05 +000012910 PyObject *new = NULL, *key, *value;
12911 Py_ssize_t i = 0;
12912 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012913
Georg Brandlceee0772007-11-27 23:48:05 +000012914 new = PyDict_New();
12915 if (!new)
12916 return NULL;
12917 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918 int x_kind, y_kind, z_kind;
12919 void *x_data, *y_data, *z_data;
12920
Georg Brandlceee0772007-11-27 23:48:05 +000012921 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012922 if (!PyUnicode_Check(x)) {
12923 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12924 "be a string if there is a second argument");
12925 goto err;
12926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012928 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12929 "arguments must have equal length");
12930 goto err;
12931 }
12932 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 x_kind = PyUnicode_KIND(x);
12934 y_kind = PyUnicode_KIND(y);
12935 x_data = PyUnicode_DATA(x);
12936 y_data = PyUnicode_DATA(y);
12937 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12938 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012939 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012940 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012941 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012942 if (!value) {
12943 Py_DECREF(key);
12944 goto err;
12945 }
Georg Brandlceee0772007-11-27 23:48:05 +000012946 res = PyDict_SetItem(new, key, value);
12947 Py_DECREF(key);
12948 Py_DECREF(value);
12949 if (res < 0)
12950 goto err;
12951 }
12952 /* create entries for deleting chars in z */
12953 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 z_kind = PyUnicode_KIND(z);
12955 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012956 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012958 if (!key)
12959 goto err;
12960 res = PyDict_SetItem(new, key, Py_None);
12961 Py_DECREF(key);
12962 if (res < 0)
12963 goto err;
12964 }
12965 }
12966 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 int kind;
12968 void *data;
12969
Georg Brandlceee0772007-11-27 23:48:05 +000012970 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012971 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012972 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12973 "to maketrans it must be a dict");
12974 goto err;
12975 }
12976 /* copy entries into the new dict, converting string keys to int keys */
12977 while (PyDict_Next(x, &i, &key, &value)) {
12978 if (PyUnicode_Check(key)) {
12979 /* convert string keys to integer keys */
12980 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012981 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012982 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12983 "table must be of length 1");
12984 goto err;
12985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 kind = PyUnicode_KIND(key);
12987 data = PyUnicode_DATA(key);
12988 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012989 if (!newkey)
12990 goto err;
12991 res = PyDict_SetItem(new, newkey, value);
12992 Py_DECREF(newkey);
12993 if (res < 0)
12994 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012995 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012996 /* just keep integer keys */
12997 if (PyDict_SetItem(new, key, value) < 0)
12998 goto err;
12999 } else {
13000 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13001 "be strings or integers");
13002 goto err;
13003 }
13004 }
13005 }
13006 return new;
13007 err:
13008 Py_DECREF(new);
13009 return NULL;
13010}
13011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013012PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013013 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014\n\
13015Return a copy of the string S, where all characters have been mapped\n\
13016through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013017Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013018Unmapped characters are left untouched. Characters mapped to None\n\
13019are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020
13021static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013025}
13026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013027PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013028 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013029\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013030Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031
13032static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013033unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013034{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013035 if (PyUnicode_READY(self) == -1)
13036 return NULL;
13037 if (PyUnicode_IS_ASCII(self))
13038 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013039 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040}
13041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013042PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013043 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013045Pad a numeric string S with zeros on the left, to fill a field\n\
13046of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047
13048static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013049unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013051 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013052 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013053 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 int kind;
13055 void *data;
13056 Py_UCS4 chr;
13057
Martin v. Löwis18e16552006-02-15 17:27:45 +000013058 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059 return NULL;
13060
Benjamin Petersonbac79492012-01-14 13:34:47 -050013061 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013062 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063
Victor Stinnerc4b49542011-12-11 22:44:26 +010013064 if (PyUnicode_GET_LENGTH(self) >= width)
13065 return unicode_result_unchanged(self);
13066
13067 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068
13069 u = pad(self, fill, 0, '0');
13070
Walter Dörwald068325e2002-04-15 13:36:47 +000013071 if (u == NULL)
13072 return NULL;
13073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 kind = PyUnicode_KIND(u);
13075 data = PyUnicode_DATA(u);
13076 chr = PyUnicode_READ(kind, data, fill);
13077
13078 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080 PyUnicode_WRITE(kind, data, 0, chr);
13081 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082 }
13083
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013084 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013085 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087
#if 0
/* Compiled-out wrapper that exposes
   PyUnicode_TransformDecimalAndSpaceToASCII() as a method taking only
   self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013096PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013097 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013099Return True if S starts with the specified prefix, False otherwise.\n\
13100With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013101With optional end, stop comparing S at that position.\n\
13102prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103
13104static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013105unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013108 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013109 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013110 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013111 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013112 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
Jesus Ceaac451502011-04-20 17:09:23 +020013114 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013115 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013116 if (PyTuple_Check(subobj)) {
13117 Py_ssize_t i;
13118 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013119 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013120 if (substring == NULL)
13121 return NULL;
13122 result = tailmatch(self, substring, start, end, -1);
13123 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013124 if (result == -1)
13125 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013126 if (result) {
13127 Py_RETURN_TRUE;
13128 }
13129 }
13130 /* nothing matched */
13131 Py_RETURN_FALSE;
13132 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013133 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013134 if (substring == NULL) {
13135 if (PyErr_ExceptionMatches(PyExc_TypeError))
13136 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13137 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013139 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013140 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013142 if (result == -1)
13143 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013144 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145}
13146
13147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013148PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013149 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013151Return True if S ends with the specified suffix, False otherwise.\n\
13152With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013153With optional end, stop comparing S at that position.\n\
13154suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155
13156static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013157unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013160 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013161 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013162 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013163 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013164 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165
Jesus Ceaac451502011-04-20 17:09:23 +020013166 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013168 if (PyTuple_Check(subobj)) {
13169 Py_ssize_t i;
13170 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013171 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013172 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013173 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013175 result = tailmatch(self, substring, start, end, +1);
13176 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013177 if (result == -1)
13178 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013179 if (result) {
13180 Py_RETURN_TRUE;
13181 }
13182 }
13183 Py_RETURN_FALSE;
13184 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013185 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013186 if (substring == NULL) {
13187 if (PyErr_ExceptionMatches(PyExc_TypeError))
13188 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13189 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013190 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013191 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013192 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013193 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013194 if (result == -1)
13195 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013196 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197}
13198
Victor Stinner202fdca2012-05-07 12:47:02 +020013199Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013200_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013201{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013202 if (!writer->readonly)
13203 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13204 else {
13205 /* Copy-on-write mode: set buffer size to 0 so
13206 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13207 * next write. */
13208 writer->size = 0;
13209 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013210 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13211 writer->data = PyUnicode_DATA(writer->buffer);
13212 writer->kind = PyUnicode_KIND(writer->buffer);
13213}
13214
Victor Stinnerd3f08822012-05-29 12:57:52 +020013215void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013216_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013217{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013218 memset(writer, 0, sizeof(*writer));
13219#ifdef Py_DEBUG
13220 writer->kind = 5; /* invalid kind */
13221#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013222 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013223}
13224
Victor Stinnerd3f08822012-05-29 12:57:52 +020013225int
13226_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13227 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013228{
Victor Stinner6989ba02013-11-18 21:08:39 +010013229#ifdef MS_WINDOWS
13230 /* On Windows, overallocate by 50% is the best factor */
13231# define OVERALLOCATE_FACTOR 2
13232#else
13233 /* On Linux, overallocate by 25% is the best factor */
13234# define OVERALLOCATE_FACTOR 4
13235#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013236 Py_ssize_t newlen;
13237 PyObject *newbuffer;
13238
Victor Stinnerd3f08822012-05-29 12:57:52 +020013239 assert(length > 0);
13240
Victor Stinner202fdca2012-05-07 12:47:02 +020013241 if (length > PY_SSIZE_T_MAX - writer->pos) {
13242 PyErr_NoMemory();
13243 return -1;
13244 }
13245 newlen = writer->pos + length;
13246
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013247 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013248
Victor Stinnerd3f08822012-05-29 12:57:52 +020013249 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013250 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013251 if (writer->overallocate
13252 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13253 /* overallocate to limit the number of realloc() */
13254 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013255 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013256 if (newlen < writer->min_length)
13257 newlen = writer->min_length;
13258
Victor Stinnerd3f08822012-05-29 12:57:52 +020013259 writer->buffer = PyUnicode_New(newlen, maxchar);
13260 if (writer->buffer == NULL)
13261 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013262 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013263 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013264 if (writer->overallocate
13265 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13266 /* overallocate to limit the number of realloc() */
13267 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013268 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013269 if (newlen < writer->min_length)
13270 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013271
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013272 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013273 /* resize + widen */
13274 newbuffer = PyUnicode_New(newlen, maxchar);
13275 if (newbuffer == NULL)
13276 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013277 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13278 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013279 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013280 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013281 }
13282 else {
13283 newbuffer = resize_compact(writer->buffer, newlen);
13284 if (newbuffer == NULL)
13285 return -1;
13286 }
13287 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013288 }
13289 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013290 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 newbuffer = PyUnicode_New(writer->size, maxchar);
13292 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013293 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013294 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13295 writer->buffer, 0, writer->pos);
13296 Py_DECREF(writer->buffer);
13297 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013298 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013299 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013300 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013301
13302#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013303}
13304
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013305Py_LOCAL_INLINE(int)
13306_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013307{
13308 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13309 return -1;
13310 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13311 writer->pos++;
13312 return 0;
13313}
13314
13315int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013316_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13317{
13318 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13319}
13320
13321int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013322_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13323{
13324 Py_UCS4 maxchar;
13325 Py_ssize_t len;
13326
13327 if (PyUnicode_READY(str) == -1)
13328 return -1;
13329 len = PyUnicode_GET_LENGTH(str);
13330 if (len == 0)
13331 return 0;
13332 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13333 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013334 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013335 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013336 Py_INCREF(str);
13337 writer->buffer = str;
13338 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013339 writer->pos += len;
13340 return 0;
13341 }
13342 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13343 return -1;
13344 }
13345 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13346 str, 0, len);
13347 writer->pos += len;
13348 return 0;
13349}
13350
Victor Stinnere215d962012-10-06 23:03:36 +020013351int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013352_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13353 Py_ssize_t start, Py_ssize_t end)
13354{
13355 Py_UCS4 maxchar;
13356 Py_ssize_t len;
13357
13358 if (PyUnicode_READY(str) == -1)
13359 return -1;
13360
13361 assert(0 <= start);
13362 assert(end <= PyUnicode_GET_LENGTH(str));
13363 assert(start <= end);
13364
13365 if (end == 0)
13366 return 0;
13367
13368 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13369 return _PyUnicodeWriter_WriteStr(writer, str);
13370
13371 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13372 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13373 else
13374 maxchar = writer->maxchar;
13375 len = end - start;
13376
13377 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13378 return -1;
13379
13380 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13381 str, start, len);
13382 writer->pos += len;
13383 return 0;
13384}
13385
13386int
Victor Stinner4a587072013-11-19 12:54:53 +010013387_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13388 const char *ascii, Py_ssize_t len)
13389{
13390 if (len == -1)
13391 len = strlen(ascii);
13392
13393 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13394
13395 if (writer->buffer == NULL && !writer->overallocate) {
13396 PyObject *str;
13397
13398 str = _PyUnicode_FromASCII(ascii, len);
13399 if (str == NULL)
13400 return -1;
13401
13402 writer->readonly = 1;
13403 writer->buffer = str;
13404 _PyUnicodeWriter_Update(writer);
13405 writer->pos += len;
13406 return 0;
13407 }
13408
13409 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13410 return -1;
13411
13412 switch (writer->kind)
13413 {
13414 case PyUnicode_1BYTE_KIND:
13415 {
13416 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13417 Py_UCS1 *data = writer->data;
13418
13419 Py_MEMCPY(data + writer->pos, str, len);
13420 break;
13421 }
13422 case PyUnicode_2BYTE_KIND:
13423 {
13424 _PyUnicode_CONVERT_BYTES(
13425 Py_UCS1, Py_UCS2,
13426 ascii, ascii + len,
13427 (Py_UCS2 *)writer->data + writer->pos);
13428 break;
13429 }
13430 case PyUnicode_4BYTE_KIND:
13431 {
13432 _PyUnicode_CONVERT_BYTES(
13433 Py_UCS1, Py_UCS4,
13434 ascii, ascii + len,
13435 (Py_UCS4 *)writer->data + writer->pos);
13436 break;
13437 }
13438 default:
13439 assert(0);
13440 }
13441
13442 writer->pos += len;
13443 return 0;
13444}
13445
13446int
13447_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13448 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013449{
13450 Py_UCS4 maxchar;
13451
13452 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13453 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13454 return -1;
13455 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13456 writer->pos += len;
13457 return 0;
13458}
13459
Victor Stinnerd3f08822012-05-29 12:57:52 +020013460PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013461_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013462{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013463 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013464 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013465 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013466 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013467 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013468 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013469 str = writer->buffer;
13470 writer->buffer = NULL;
13471 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13472 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013473 }
13474 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13475 PyObject *newbuffer;
13476 newbuffer = resize_compact(writer->buffer, writer->pos);
13477 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013478 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013479 return NULL;
13480 }
13481 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013482 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013483 str = writer->buffer;
13484 writer->buffer = NULL;
13485 assert(_PyUnicode_CheckConsistency(str, 1));
13486 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013487}
13488
Victor Stinnerd3f08822012-05-29 12:57:52 +020013489void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013490_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013491{
13492 Py_CLEAR(writer->buffer);
13493}
13494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013496
13497PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013498 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013499\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013500Return a formatted version of S, using substitutions from args and kwargs.\n\
13501The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013502
Eric Smith27bbca62010-11-04 17:06:58 +000013503PyDoc_STRVAR(format_map__doc__,
13504 "S.format_map(mapping) -> str\n\
13505\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013506Return a formatted version of S, using substitutions from mapping.\n\
13507The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013508
Eric Smith4a7d76d2008-05-30 18:10:19 +000013509static PyObject *
13510unicode__format__(PyObject* self, PyObject* args)
13511{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013512 PyObject *format_spec;
13513 _PyUnicodeWriter writer;
13514 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013515
13516 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13517 return NULL;
13518
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 if (PyUnicode_READY(self) == -1)
13520 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013521 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013522 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13523 self, format_spec, 0,
13524 PyUnicode_GET_LENGTH(format_spec));
13525 if (ret == -1) {
13526 _PyUnicodeWriter_Dealloc(&writer);
13527 return NULL;
13528 }
13529 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013530}
13531
Eric Smith8c663262007-08-25 02:26:07 +000013532PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013534\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013535Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013536
13537static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013538unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013540 Py_ssize_t size;
13541
13542 /* If it's a compact object, account for base structure +
13543 character data. */
13544 if (PyUnicode_IS_COMPACT_ASCII(v))
13545 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13546 else if (PyUnicode_IS_COMPACT(v))
13547 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013548 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013549 else {
13550 /* If it is a two-block object, account for base object, and
13551 for character block if present. */
13552 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013553 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013554 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013555 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013556 }
13557 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013558 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013559 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013560 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013561 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013562 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563
13564 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013565}
13566
13567PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013569
13570static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013571unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013572{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013573 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013574 if (!copy)
13575 return NULL;
13576 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013577}
13578
Guido van Rossumd57fd912000-03-10 22:53:23 +000013579static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013580 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013581 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013582 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13583 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013584 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13585 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013586 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013587 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13588 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13589 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013590 {"expandtabs", (PyCFunction) unicode_expandtabs,
13591 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013592 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013593 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013594 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13595 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13596 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013597 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013598 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13599 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13600 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013601 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013602 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013603 {"splitlines", (PyCFunction) unicode_splitlines,
13604 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013605 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013606 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13607 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13608 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13609 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13610 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13611 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13612 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13613 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13614 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13615 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13616 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13617 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13618 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13619 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013620 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013621 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013622 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013623 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013624 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013625 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013626 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013627 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013628#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013629 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013630 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013631#endif
13632
Benjamin Peterson14339b62009-01-31 16:36:08 +000013633 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013634 {NULL, NULL}
13635};
13636
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013637static PyObject *
13638unicode_mod(PyObject *v, PyObject *w)
13639{
Brian Curtindfc80e32011-08-10 20:28:54 -050013640 if (!PyUnicode_Check(v))
13641 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013642 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013643}
13644
13645static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013646 0, /*nb_add*/
13647 0, /*nb_subtract*/
13648 0, /*nb_multiply*/
13649 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013650};
13651
Guido van Rossumd57fd912000-03-10 22:53:23 +000013652static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013653 (lenfunc) unicode_length, /* sq_length */
13654 PyUnicode_Concat, /* sq_concat */
13655 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13656 (ssizeargfunc) unicode_getitem, /* sq_item */
13657 0, /* sq_slice */
13658 0, /* sq_ass_item */
13659 0, /* sq_ass_slice */
13660 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013661};
13662
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013663static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013664unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013665{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013666 if (PyUnicode_READY(self) == -1)
13667 return NULL;
13668
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013669 if (PyIndex_Check(item)) {
13670 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013671 if (i == -1 && PyErr_Occurred())
13672 return NULL;
13673 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013674 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013675 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013676 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013677 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013678 PyObject *result;
13679 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013680 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013681 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013683 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013684 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013685 return NULL;
13686 }
13687
13688 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013689 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013690 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013691 slicelength == PyUnicode_GET_LENGTH(self)) {
13692 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013693 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013694 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013695 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013696 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013697 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013698 src_kind = PyUnicode_KIND(self);
13699 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013700 if (!PyUnicode_IS_ASCII(self)) {
13701 kind_limit = kind_maxchar_limit(src_kind);
13702 max_char = 0;
13703 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13704 ch = PyUnicode_READ(src_kind, src_data, cur);
13705 if (ch > max_char) {
13706 max_char = ch;
13707 if (max_char >= kind_limit)
13708 break;
13709 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013710 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013711 }
Victor Stinner55c99112011-10-13 01:17:06 +020013712 else
13713 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013714 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013715 if (result == NULL)
13716 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013717 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013718 dest_data = PyUnicode_DATA(result);
13719
13720 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013721 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13722 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013723 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013724 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013725 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013726 } else {
13727 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13728 return NULL;
13729 }
13730}
13731
13732static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013733 (lenfunc)unicode_length, /* mp_length */
13734 (binaryfunc)unicode_subscript, /* mp_subscript */
13735 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013736};
13737
Guido van Rossumd57fd912000-03-10 22:53:23 +000013738
Guido van Rossumd57fd912000-03-10 22:53:23 +000013739/* Helpers for PyUnicode_Format() */
13740
Victor Stinnera47082312012-10-04 02:19:54 +020013741struct unicode_formatter_t {
13742 PyObject *args;
13743 int args_owned;
13744 Py_ssize_t arglen, argidx;
13745 PyObject *dict;
13746
13747 enum PyUnicode_Kind fmtkind;
13748 Py_ssize_t fmtcnt, fmtpos;
13749 void *fmtdata;
13750 PyObject *fmtstr;
13751
13752 _PyUnicodeWriter writer;
13753};
13754
13755struct unicode_format_arg_t {
13756 Py_UCS4 ch;
13757 int flags;
13758 Py_ssize_t width;
13759 int prec;
13760 int sign;
13761};
13762
Guido van Rossumd57fd912000-03-10 22:53:23 +000013763static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013764unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013765{
Victor Stinnera47082312012-10-04 02:19:54 +020013766 Py_ssize_t argidx = ctx->argidx;
13767
13768 if (argidx < ctx->arglen) {
13769 ctx->argidx++;
13770 if (ctx->arglen < 0)
13771 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013772 else
Victor Stinnera47082312012-10-04 02:19:54 +020013773 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774 }
13775 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013776 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013777 return NULL;
13778}
13779
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013780/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013781
Victor Stinnera47082312012-10-04 02:19:54 +020013782/* Format a float into the writer if the writer is not NULL, or into *p_output
13783 otherwise.
13784
13785 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013786static int
Victor Stinnera47082312012-10-04 02:19:54 +020013787formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13788 PyObject **p_output,
13789 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013790{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013791 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013792 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013793 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013794 int prec;
13795 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013796
Guido van Rossumd57fd912000-03-10 22:53:23 +000013797 x = PyFloat_AsDouble(v);
13798 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013799 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013800
Victor Stinnera47082312012-10-04 02:19:54 +020013801 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013802 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013803 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013804
Victor Stinnera47082312012-10-04 02:19:54 +020013805 if (arg->flags & F_ALT)
13806 dtoa_flags = Py_DTSF_ALT;
13807 else
13808 dtoa_flags = 0;
13809 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013810 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013811 return -1;
13812 len = strlen(p);
13813 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013814 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013815 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013816 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013817 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013818 }
13819 else
13820 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013821 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013822 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823}
13824
Victor Stinnerd0880d52012-04-27 23:40:13 +020013825/* formatlong() emulates the format codes d, u, o, x and X, and
13826 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13827 * Python's regular ints.
13828 * Return value: a new PyUnicodeObject*, or NULL if error.
13829 * The output string is of the form
13830 * "-"? ("0x" | "0X")? digit+
13831 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13832 * set in flags. The case of hex digits will be correct,
13833 * There will be at least prec digits, zero-filled on the left if
13834 * necessary to get that many.
13835 * val object to be converted
13836 * flags bitmask of format flags; only F_ALT is looked at
13837 * prec minimum number of digits; 0-fill on left if needed
13838 * type a character in [duoxX]; u acts the same as d
13839 *
13840 * CAUTION: o, x and X conversions on regular ints can never
13841 * produce a '-' sign, but can for Python's unbounded ints.
13842 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013843static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013844formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013845{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013846 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013847 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013848 Py_ssize_t i;
13849 int sign; /* 1 if '-', else 0 */
13850 int len; /* number of characters */
13851 Py_ssize_t llen;
13852 int numdigits; /* len == numnondigits + numdigits */
13853 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013854 int prec = arg->prec;
13855 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013856
Victor Stinnerd0880d52012-04-27 23:40:13 +020013857 /* Avoid exceeding SSIZE_T_MAX */
13858 if (prec > INT_MAX-3) {
13859 PyErr_SetString(PyExc_OverflowError,
13860 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013861 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013862 }
13863
13864 assert(PyLong_Check(val));
13865
13866 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013867 default:
13868 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013869 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013870 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013871 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013872 /* int and int subclasses should print numerically when a numeric */
13873 /* format code is used (see issue18780) */
13874 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013875 break;
13876 case 'o':
13877 numnondigits = 2;
13878 result = PyNumber_ToBase(val, 8);
13879 break;
13880 case 'x':
13881 case 'X':
13882 numnondigits = 2;
13883 result = PyNumber_ToBase(val, 16);
13884 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013885 }
13886 if (!result)
13887 return NULL;
13888
13889 assert(unicode_modifiable(result));
13890 assert(PyUnicode_IS_READY(result));
13891 assert(PyUnicode_IS_ASCII(result));
13892
13893 /* To modify the string in-place, there can only be one reference. */
13894 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013895 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013896 PyErr_BadInternalCall();
13897 return NULL;
13898 }
13899 buf = PyUnicode_DATA(result);
13900 llen = PyUnicode_GET_LENGTH(result);
13901 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013902 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013903 PyErr_SetString(PyExc_ValueError,
13904 "string too large in _PyBytes_FormatLong");
13905 return NULL;
13906 }
13907 len = (int)llen;
13908 sign = buf[0] == '-';
13909 numnondigits += sign;
13910 numdigits = len - numnondigits;
13911 assert(numdigits > 0);
13912
13913 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013914 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013915 (type == 'o' || type == 'x' || type == 'X'))) {
13916 assert(buf[sign] == '0');
13917 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13918 buf[sign+1] == 'o');
13919 numnondigits -= 2;
13920 buf += 2;
13921 len -= 2;
13922 if (sign)
13923 buf[0] = '-';
13924 assert(len == numnondigits + numdigits);
13925 assert(numdigits > 0);
13926 }
13927
13928 /* Fill with leading zeroes to meet minimum width. */
13929 if (prec > numdigits) {
13930 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13931 numnondigits + prec);
13932 char *b1;
13933 if (!r1) {
13934 Py_DECREF(result);
13935 return NULL;
13936 }
13937 b1 = PyBytes_AS_STRING(r1);
13938 for (i = 0; i < numnondigits; ++i)
13939 *b1++ = *buf++;
13940 for (i = 0; i < prec - numdigits; i++)
13941 *b1++ = '0';
13942 for (i = 0; i < numdigits; i++)
13943 *b1++ = *buf++;
13944 *b1 = '\0';
13945 Py_DECREF(result);
13946 result = r1;
13947 buf = PyBytes_AS_STRING(result);
13948 len = numnondigits + prec;
13949 }
13950
13951 /* Fix up case for hex conversions. */
13952 if (type == 'X') {
13953 /* Need to convert all lower case letters to upper case.
13954 and need to convert 0x to 0X (and -0x to -0X). */
13955 for (i = 0; i < len; i++)
13956 if (buf[i] >= 'a' && buf[i] <= 'x')
13957 buf[i] -= 'a'-'A';
13958 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013959 if (!PyUnicode_Check(result)
13960 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013961 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013962 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013963 Py_DECREF(result);
13964 result = unicode;
13965 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013966 else if (len != PyUnicode_GET_LENGTH(result)) {
13967 if (PyUnicode_Resize(&result, len) < 0)
13968 Py_CLEAR(result);
13969 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013970 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013971}
13972
Ethan Furmandf3ed242014-01-05 06:50:30 -080013973/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020013974 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013975 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013976 * -1 and raise an exception on error */
13977static int
Victor Stinnera47082312012-10-04 02:19:54 +020013978mainformatlong(PyObject *v,
13979 struct unicode_format_arg_t *arg,
13980 PyObject **p_output,
13981 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013982{
13983 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013984 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013985
13986 if (!PyNumber_Check(v))
13987 goto wrongtype;
13988
Ethan Furman9ab74802014-03-21 06:38:46 -070013989 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020013990 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080013991 if (type == 'o' || type == 'x' || type == 'X') {
13992 iobj = PyNumber_Index(v);
13993 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070013994 if (PyErr_ExceptionMatches(PyExc_TypeError))
13995 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070013996 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080013997 }
13998 }
13999 else {
14000 iobj = PyNumber_Long(v);
14001 if (iobj == NULL ) {
14002 if (PyErr_ExceptionMatches(PyExc_TypeError))
14003 goto wrongtype;
14004 return -1;
14005 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014006 }
14007 assert(PyLong_Check(iobj));
14008 }
14009 else {
14010 iobj = v;
14011 Py_INCREF(iobj);
14012 }
14013
14014 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014015 && arg->width == -1 && arg->prec == -1
14016 && !(arg->flags & (F_SIGN | F_BLANK))
14017 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014018 {
14019 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014020 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014021 int base;
14022
Victor Stinnera47082312012-10-04 02:19:54 +020014023 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014024 {
14025 default:
14026 assert(0 && "'type' not in [diuoxX]");
14027 case 'd':
14028 case 'i':
14029 case 'u':
14030 base = 10;
14031 break;
14032 case 'o':
14033 base = 8;
14034 break;
14035 case 'x':
14036 case 'X':
14037 base = 16;
14038 break;
14039 }
14040
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014041 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14042 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014043 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014044 }
14045 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014046 return 1;
14047 }
14048
Victor Stinnera47082312012-10-04 02:19:54 +020014049 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014050 Py_DECREF(iobj);
14051 if (res == NULL)
14052 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014053 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014054 return 0;
14055
14056wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014057 switch(type)
14058 {
14059 case 'o':
14060 case 'x':
14061 case 'X':
14062 PyErr_Format(PyExc_TypeError,
14063 "%%%c format: an integer is required, "
14064 "not %.200s",
14065 type, Py_TYPE(v)->tp_name);
14066 break;
14067 default:
14068 PyErr_Format(PyExc_TypeError,
14069 "%%%c format: a number is required, "
14070 "not %.200s",
14071 type, Py_TYPE(v)->tp_name);
14072 break;
14073 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014074 return -1;
14075}
14076
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014077static Py_UCS4
14078formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014079{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014080 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014081 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014082 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014083 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014084 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014085 goto onError;
14086 }
14087 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014088 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014089 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014090 /* make sure number is a type of integer */
14091 if (!PyLong_Check(v)) {
14092 iobj = PyNumber_Index(v);
14093 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014094 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014095 }
14096 v = iobj;
14097 Py_DECREF(iobj);
14098 }
14099 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014100 x = PyLong_AsLong(v);
14101 if (x == -1 && PyErr_Occurred())
14102 goto onError;
14103
Victor Stinner8faf8212011-12-08 22:14:11 +010014104 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014105 PyErr_SetString(PyExc_OverflowError,
14106 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014107 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014108 }
14109
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014110 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014111 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014112
Benjamin Peterson29060642009-01-31 22:14:21 +000014113 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014114 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014115 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014116 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014117}
14118
Victor Stinnera47082312012-10-04 02:19:54 +020014119/* Parse options of an argument: flags, width, precision.
14120 Handle also "%(name)" syntax.
14121
14122 Return 0 if the argument has been formatted into arg->str.
14123 Return 1 if the argument has been written into ctx->writer,
14124 Raise an exception and return -1 on error. */
14125static int
14126unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14127 struct unicode_format_arg_t *arg)
14128{
14129#define FORMAT_READ(ctx) \
14130 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14131
14132 PyObject *v;
14133
Victor Stinnera47082312012-10-04 02:19:54 +020014134 if (arg->ch == '(') {
14135 /* Get argument value from a dictionary. Example: "%(name)s". */
14136 Py_ssize_t keystart;
14137 Py_ssize_t keylen;
14138 PyObject *key;
14139 int pcount = 1;
14140
14141 if (ctx->dict == NULL) {
14142 PyErr_SetString(PyExc_TypeError,
14143 "format requires a mapping");
14144 return -1;
14145 }
14146 ++ctx->fmtpos;
14147 --ctx->fmtcnt;
14148 keystart = ctx->fmtpos;
14149 /* Skip over balanced parentheses */
14150 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14151 arg->ch = FORMAT_READ(ctx);
14152 if (arg->ch == ')')
14153 --pcount;
14154 else if (arg->ch == '(')
14155 ++pcount;
14156 ctx->fmtpos++;
14157 }
14158 keylen = ctx->fmtpos - keystart - 1;
14159 if (ctx->fmtcnt < 0 || pcount > 0) {
14160 PyErr_SetString(PyExc_ValueError,
14161 "incomplete format key");
14162 return -1;
14163 }
14164 key = PyUnicode_Substring(ctx->fmtstr,
14165 keystart, keystart + keylen);
14166 if (key == NULL)
14167 return -1;
14168 if (ctx->args_owned) {
14169 Py_DECREF(ctx->args);
14170 ctx->args_owned = 0;
14171 }
14172 ctx->args = PyObject_GetItem(ctx->dict, key);
14173 Py_DECREF(key);
14174 if (ctx->args == NULL)
14175 return -1;
14176 ctx->args_owned = 1;
14177 ctx->arglen = -1;
14178 ctx->argidx = -2;
14179 }
14180
14181 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014182 while (--ctx->fmtcnt >= 0) {
14183 arg->ch = FORMAT_READ(ctx);
14184 ctx->fmtpos++;
14185 switch (arg->ch) {
14186 case '-': arg->flags |= F_LJUST; continue;
14187 case '+': arg->flags |= F_SIGN; continue;
14188 case ' ': arg->flags |= F_BLANK; continue;
14189 case '#': arg->flags |= F_ALT; continue;
14190 case '0': arg->flags |= F_ZERO; continue;
14191 }
14192 break;
14193 }
14194
14195 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014196 if (arg->ch == '*') {
14197 v = unicode_format_getnextarg(ctx);
14198 if (v == NULL)
14199 return -1;
14200 if (!PyLong_Check(v)) {
14201 PyErr_SetString(PyExc_TypeError,
14202 "* wants int");
14203 return -1;
14204 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014205 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014206 if (arg->width == -1 && PyErr_Occurred())
14207 return -1;
14208 if (arg->width < 0) {
14209 arg->flags |= F_LJUST;
14210 arg->width = -arg->width;
14211 }
14212 if (--ctx->fmtcnt >= 0) {
14213 arg->ch = FORMAT_READ(ctx);
14214 ctx->fmtpos++;
14215 }
14216 }
14217 else if (arg->ch >= '0' && arg->ch <= '9') {
14218 arg->width = arg->ch - '0';
14219 while (--ctx->fmtcnt >= 0) {
14220 arg->ch = FORMAT_READ(ctx);
14221 ctx->fmtpos++;
14222 if (arg->ch < '0' || arg->ch > '9')
14223 break;
14224 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14225 mixing signed and unsigned comparison. Since arg->ch is between
14226 '0' and '9', casting to int is safe. */
14227 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14228 PyErr_SetString(PyExc_ValueError,
14229 "width too big");
14230 return -1;
14231 }
14232 arg->width = arg->width*10 + (arg->ch - '0');
14233 }
14234 }
14235
14236 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014237 if (arg->ch == '.') {
14238 arg->prec = 0;
14239 if (--ctx->fmtcnt >= 0) {
14240 arg->ch = FORMAT_READ(ctx);
14241 ctx->fmtpos++;
14242 }
14243 if (arg->ch == '*') {
14244 v = unicode_format_getnextarg(ctx);
14245 if (v == NULL)
14246 return -1;
14247 if (!PyLong_Check(v)) {
14248 PyErr_SetString(PyExc_TypeError,
14249 "* wants int");
14250 return -1;
14251 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014252 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014253 if (arg->prec == -1 && PyErr_Occurred())
14254 return -1;
14255 if (arg->prec < 0)
14256 arg->prec = 0;
14257 if (--ctx->fmtcnt >= 0) {
14258 arg->ch = FORMAT_READ(ctx);
14259 ctx->fmtpos++;
14260 }
14261 }
14262 else if (arg->ch >= '0' && arg->ch <= '9') {
14263 arg->prec = arg->ch - '0';
14264 while (--ctx->fmtcnt >= 0) {
14265 arg->ch = FORMAT_READ(ctx);
14266 ctx->fmtpos++;
14267 if (arg->ch < '0' || arg->ch > '9')
14268 break;
14269 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14270 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014271 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014272 return -1;
14273 }
14274 arg->prec = arg->prec*10 + (arg->ch - '0');
14275 }
14276 }
14277 }
14278
14279 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14280 if (ctx->fmtcnt >= 0) {
14281 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14282 if (--ctx->fmtcnt >= 0) {
14283 arg->ch = FORMAT_READ(ctx);
14284 ctx->fmtpos++;
14285 }
14286 }
14287 }
14288 if (ctx->fmtcnt < 0) {
14289 PyErr_SetString(PyExc_ValueError,
14290 "incomplete format");
14291 return -1;
14292 }
14293 return 0;
14294
14295#undef FORMAT_READ
14296}
14297
14298/* Format one argument. Supported conversion specifiers:
14299
14300 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014301 - "i", "d", "u": int or float
14302 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014303 - "e", "E", "f", "F", "g", "G": float
14304 - "c": int or str (1 character)
14305
Victor Stinner8dbd4212012-12-04 09:30:24 +010014306 When possible, the output is written directly into the Unicode writer
14307 (ctx->writer). A string is created when padding is required.
14308
Victor Stinnera47082312012-10-04 02:19:54 +020014309 Return 0 if the argument has been formatted into *p_str,
14310 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014311 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014312static int
14313unicode_format_arg_format(struct unicode_formatter_t *ctx,
14314 struct unicode_format_arg_t *arg,
14315 PyObject **p_str)
14316{
14317 PyObject *v;
14318 _PyUnicodeWriter *writer = &ctx->writer;
14319
14320 if (ctx->fmtcnt == 0)
14321 ctx->writer.overallocate = 0;
14322
14323 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014324 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014325 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014326 return 1;
14327 }
14328
14329 v = unicode_format_getnextarg(ctx);
14330 if (v == NULL)
14331 return -1;
14332
Victor Stinnera47082312012-10-04 02:19:54 +020014333
14334 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014335 case 's':
14336 case 'r':
14337 case 'a':
14338 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14339 /* Fast path */
14340 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14341 return -1;
14342 return 1;
14343 }
14344
14345 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14346 *p_str = v;
14347 Py_INCREF(*p_str);
14348 }
14349 else {
14350 if (arg->ch == 's')
14351 *p_str = PyObject_Str(v);
14352 else if (arg->ch == 'r')
14353 *p_str = PyObject_Repr(v);
14354 else
14355 *p_str = PyObject_ASCII(v);
14356 }
14357 break;
14358
14359 case 'i':
14360 case 'd':
14361 case 'u':
14362 case 'o':
14363 case 'x':
14364 case 'X':
14365 {
14366 int ret = mainformatlong(v, arg, p_str, writer);
14367 if (ret != 0)
14368 return ret;
14369 arg->sign = 1;
14370 break;
14371 }
14372
14373 case 'e':
14374 case 'E':
14375 case 'f':
14376 case 'F':
14377 case 'g':
14378 case 'G':
14379 if (arg->width == -1 && arg->prec == -1
14380 && !(arg->flags & (F_SIGN | F_BLANK)))
14381 {
14382 /* Fast path */
14383 if (formatfloat(v, arg, NULL, writer) == -1)
14384 return -1;
14385 return 1;
14386 }
14387
14388 arg->sign = 1;
14389 if (formatfloat(v, arg, p_str, NULL) == -1)
14390 return -1;
14391 break;
14392
14393 case 'c':
14394 {
14395 Py_UCS4 ch = formatchar(v);
14396 if (ch == (Py_UCS4) -1)
14397 return -1;
14398 if (arg->width == -1 && arg->prec == -1) {
14399 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014400 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014401 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014402 return 1;
14403 }
14404 *p_str = PyUnicode_FromOrdinal(ch);
14405 break;
14406 }
14407
14408 default:
14409 PyErr_Format(PyExc_ValueError,
14410 "unsupported format character '%c' (0x%x) "
14411 "at index %zd",
14412 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14413 (int)arg->ch,
14414 ctx->fmtpos - 1);
14415 return -1;
14416 }
14417 if (*p_str == NULL)
14418 return -1;
14419 assert (PyUnicode_Check(*p_str));
14420 return 0;
14421}
14422
14423static int
14424unicode_format_arg_output(struct unicode_formatter_t *ctx,
14425 struct unicode_format_arg_t *arg,
14426 PyObject *str)
14427{
14428 Py_ssize_t len;
14429 enum PyUnicode_Kind kind;
14430 void *pbuf;
14431 Py_ssize_t pindex;
14432 Py_UCS4 signchar;
14433 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014434 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014435 Py_ssize_t sublen;
14436 _PyUnicodeWriter *writer = &ctx->writer;
14437 Py_UCS4 fill;
14438
14439 fill = ' ';
14440 if (arg->sign && arg->flags & F_ZERO)
14441 fill = '0';
14442
14443 if (PyUnicode_READY(str) == -1)
14444 return -1;
14445
14446 len = PyUnicode_GET_LENGTH(str);
14447 if ((arg->width == -1 || arg->width <= len)
14448 && (arg->prec == -1 || arg->prec >= len)
14449 && !(arg->flags & (F_SIGN | F_BLANK)))
14450 {
14451 /* Fast path */
14452 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14453 return -1;
14454 return 0;
14455 }
14456
14457 /* Truncate the string for "s", "r" and "a" formats
14458 if the precision is set */
14459 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14460 if (arg->prec >= 0 && len > arg->prec)
14461 len = arg->prec;
14462 }
14463
14464 /* Adjust sign and width */
14465 kind = PyUnicode_KIND(str);
14466 pbuf = PyUnicode_DATA(str);
14467 pindex = 0;
14468 signchar = '\0';
14469 if (arg->sign) {
14470 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14471 if (ch == '-' || ch == '+') {
14472 signchar = ch;
14473 len--;
14474 pindex++;
14475 }
14476 else if (arg->flags & F_SIGN)
14477 signchar = '+';
14478 else if (arg->flags & F_BLANK)
14479 signchar = ' ';
14480 else
14481 arg->sign = 0;
14482 }
14483 if (arg->width < len)
14484 arg->width = len;
14485
14486 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014487 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014488 if (!(arg->flags & F_LJUST)) {
14489 if (arg->sign) {
14490 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014491 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014492 }
14493 else {
14494 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014495 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014496 }
14497 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014498 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14499 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014500 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014501 }
14502
Victor Stinnera47082312012-10-04 02:19:54 +020014503 buflen = arg->width;
14504 if (arg->sign && len == arg->width)
14505 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014506 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014507 return -1;
14508
14509 /* Write the sign if needed */
14510 if (arg->sign) {
14511 if (fill != ' ') {
14512 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14513 writer->pos += 1;
14514 }
14515 if (arg->width > len)
14516 arg->width--;
14517 }
14518
14519 /* Write the numeric prefix for "x", "X" and "o" formats
14520 if the alternate form is used.
14521 For example, write "0x" for the "%#x" format. */
14522 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14523 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14524 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14525 if (fill != ' ') {
14526 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14527 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14528 writer->pos += 2;
14529 pindex += 2;
14530 }
14531 arg->width -= 2;
14532 if (arg->width < 0)
14533 arg->width = 0;
14534 len -= 2;
14535 }
14536
14537 /* Pad left with the fill character if needed */
14538 if (arg->width > len && !(arg->flags & F_LJUST)) {
14539 sublen = arg->width - len;
14540 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14541 writer->pos += sublen;
14542 arg->width = len;
14543 }
14544
14545 /* If padding with spaces: write sign if needed and/or numeric prefix if
14546 the alternate form is used */
14547 if (fill == ' ') {
14548 if (arg->sign) {
14549 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14550 writer->pos += 1;
14551 }
14552 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14553 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14554 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14555 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14556 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14557 writer->pos += 2;
14558 pindex += 2;
14559 }
14560 }
14561
14562 /* Write characters */
14563 if (len) {
14564 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14565 str, pindex, len);
14566 writer->pos += len;
14567 }
14568
14569 /* Pad right with the fill character if needed */
14570 if (arg->width > len) {
14571 sublen = arg->width - len;
14572 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14573 writer->pos += sublen;
14574 }
14575 return 0;
14576}
14577
14578/* Helper of PyUnicode_Format(): format one arg.
14579 Return 0 on success, raise an exception and return -1 on error. */
14580static int
14581unicode_format_arg(struct unicode_formatter_t *ctx)
14582{
14583 struct unicode_format_arg_t arg;
14584 PyObject *str;
14585 int ret;
14586
Victor Stinner8dbd4212012-12-04 09:30:24 +010014587 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14588 arg.flags = 0;
14589 arg.width = -1;
14590 arg.prec = -1;
14591 arg.sign = 0;
14592 str = NULL;
14593
Victor Stinnera47082312012-10-04 02:19:54 +020014594 ret = unicode_format_arg_parse(ctx, &arg);
14595 if (ret == -1)
14596 return -1;
14597
14598 ret = unicode_format_arg_format(ctx, &arg, &str);
14599 if (ret == -1)
14600 return -1;
14601
14602 if (ret != 1) {
14603 ret = unicode_format_arg_output(ctx, &arg, str);
14604 Py_DECREF(str);
14605 if (ret == -1)
14606 return -1;
14607 }
14608
14609 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14610 PyErr_SetString(PyExc_TypeError,
14611 "not all arguments converted during string formatting");
14612 return -1;
14613 }
14614 return 0;
14615}
14616
Alexander Belopolsky40018472011-02-26 01:02:56 +000014617PyObject *
14618PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014619{
Victor Stinnera47082312012-10-04 02:19:54 +020014620 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014621
Guido van Rossumd57fd912000-03-10 22:53:23 +000014622 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014623 PyErr_BadInternalCall();
14624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014625 }
Victor Stinnera47082312012-10-04 02:19:54 +020014626
14627 ctx.fmtstr = PyUnicode_FromObject(format);
14628 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014629 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014630 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14631 Py_DECREF(ctx.fmtstr);
14632 return NULL;
14633 }
14634 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14635 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14636 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14637 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014638
Victor Stinner8f674cc2013-04-17 23:02:17 +020014639 _PyUnicodeWriter_Init(&ctx.writer);
14640 ctx.writer.min_length = ctx.fmtcnt + 100;
14641 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014642
Guido van Rossumd57fd912000-03-10 22:53:23 +000014643 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014644 ctx.arglen = PyTuple_Size(args);
14645 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014646 }
14647 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014648 ctx.arglen = -1;
14649 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014650 }
Victor Stinnera47082312012-10-04 02:19:54 +020014651 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014652 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014653 ctx.dict = args;
14654 else
14655 ctx.dict = NULL;
14656 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014657
Victor Stinnera47082312012-10-04 02:19:54 +020014658 while (--ctx.fmtcnt >= 0) {
14659 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014660 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014661
14662 nonfmtpos = ctx.fmtpos++;
14663 while (ctx.fmtcnt >= 0 &&
14664 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14665 ctx.fmtpos++;
14666 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014667 }
Victor Stinnera47082312012-10-04 02:19:54 +020014668 if (ctx.fmtcnt < 0) {
14669 ctx.fmtpos--;
14670 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014671 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014672
Victor Stinnercfc4c132013-04-03 01:48:39 +020014673 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14674 nonfmtpos, ctx.fmtpos) < 0)
14675 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014676 }
14677 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014678 ctx.fmtpos++;
14679 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014680 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014681 }
14682 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014683
Victor Stinnera47082312012-10-04 02:19:54 +020014684 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014685 PyErr_SetString(PyExc_TypeError,
14686 "not all arguments converted during string formatting");
14687 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014688 }
14689
Victor Stinnera47082312012-10-04 02:19:54 +020014690 if (ctx.args_owned) {
14691 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014692 }
Victor Stinnera47082312012-10-04 02:19:54 +020014693 Py_DECREF(ctx.fmtstr);
14694 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014695
Benjamin Peterson29060642009-01-31 22:14:21 +000014696 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014697 Py_DECREF(ctx.fmtstr);
14698 _PyUnicodeWriter_Dealloc(&ctx.writer);
14699 if (ctx.args_owned) {
14700 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014701 }
14702 return NULL;
14703}
14704
Jeremy Hylton938ace62002-07-17 16:30:39 +000014705static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014706unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14707
Tim Peters6d6c1a32001-08-02 04:15:00 +000014708static PyObject *
14709unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14710{
Benjamin Peterson29060642009-01-31 22:14:21 +000014711 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014712 static char *kwlist[] = {"object", "encoding", "errors", 0};
14713 char *encoding = NULL;
14714 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014715
Benjamin Peterson14339b62009-01-31 16:36:08 +000014716 if (type != &PyUnicode_Type)
14717 return unicode_subtype_new(type, args, kwds);
14718 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014719 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014720 return NULL;
14721 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014722 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014723 if (encoding == NULL && errors == NULL)
14724 return PyObject_Str(x);
14725 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014726 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014727}
14728
Guido van Rossume023fe02001-08-30 03:12:59 +000014729static PyObject *
14730unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14731{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014732 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014733 Py_ssize_t length, char_size;
14734 int share_wstr, share_utf8;
14735 unsigned int kind;
14736 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014737
Benjamin Peterson14339b62009-01-31 16:36:08 +000014738 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014739
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014740 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014741 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014742 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014743 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014744 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014745 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014746 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014747 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014748
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014749 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014750 if (self == NULL) {
14751 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014752 return NULL;
14753 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014754 kind = PyUnicode_KIND(unicode);
14755 length = PyUnicode_GET_LENGTH(unicode);
14756
14757 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014758#ifdef Py_DEBUG
14759 _PyUnicode_HASH(self) = -1;
14760#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014761 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014762#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014763 _PyUnicode_STATE(self).interned = 0;
14764 _PyUnicode_STATE(self).kind = kind;
14765 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014766 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014767 _PyUnicode_STATE(self).ready = 1;
14768 _PyUnicode_WSTR(self) = NULL;
14769 _PyUnicode_UTF8_LENGTH(self) = 0;
14770 _PyUnicode_UTF8(self) = NULL;
14771 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014772 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014773
14774 share_utf8 = 0;
14775 share_wstr = 0;
14776 if (kind == PyUnicode_1BYTE_KIND) {
14777 char_size = 1;
14778 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14779 share_utf8 = 1;
14780 }
14781 else if (kind == PyUnicode_2BYTE_KIND) {
14782 char_size = 2;
14783 if (sizeof(wchar_t) == 2)
14784 share_wstr = 1;
14785 }
14786 else {
14787 assert(kind == PyUnicode_4BYTE_KIND);
14788 char_size = 4;
14789 if (sizeof(wchar_t) == 4)
14790 share_wstr = 1;
14791 }
14792
14793 /* Ensure we won't overflow the length. */
14794 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14795 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014796 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014797 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014798 data = PyObject_MALLOC((length + 1) * char_size);
14799 if (data == NULL) {
14800 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014801 goto onError;
14802 }
14803
Victor Stinnerc3c74152011-10-02 20:39:55 +020014804 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014805 if (share_utf8) {
14806 _PyUnicode_UTF8_LENGTH(self) = length;
14807 _PyUnicode_UTF8(self) = data;
14808 }
14809 if (share_wstr) {
14810 _PyUnicode_WSTR_LENGTH(self) = length;
14811 _PyUnicode_WSTR(self) = (wchar_t *)data;
14812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014813
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014814 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014815 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014816 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014817#ifdef Py_DEBUG
14818 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14819#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014820 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014821 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014822
14823onError:
14824 Py_DECREF(unicode);
14825 Py_DECREF(self);
14826 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014827}
14828
/* Docstring of the str type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014840
/* Forward declaration: needed by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in "str" type.  The initializer is positional,
   so each entry is labelled with the slot it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14886
14887/* Initialize the Unicode implementation */
14888
Victor Stinner3a50e702011-10-18 21:21:00 +020014889int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014890{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014891 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014892 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014893 0x000A, /* LINE FEED */
14894 0x000D, /* CARRIAGE RETURN */
14895 0x001C, /* FILE SEPARATOR */
14896 0x001D, /* GROUP SEPARATOR */
14897 0x001E, /* RECORD SEPARATOR */
14898 0x0085, /* NEXT LINE */
14899 0x2028, /* LINE SEPARATOR */
14900 0x2029, /* PARAGRAPH SEPARATOR */
14901 };
14902
Fred Drakee4315f52000-05-09 19:53:39 +000014903 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014904 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014905 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014906 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014907 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014908
Guido van Rossumcacfc072002-05-24 19:01:59 +000014909 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014910 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014911
14912 /* initialize the linebreak bloom filter */
14913 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014914 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014915 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014916
Christian Heimes26532f72013-07-20 14:57:16 +020014917 if (PyType_Ready(&EncodingMapType) < 0)
14918 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014919
Benjamin Petersonc4311282012-10-30 23:21:10 -040014920 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14921 Py_FatalError("Can't initialize field name iterator type");
14922
14923 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14924 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014925
Victor Stinner3a50e702011-10-18 21:21:00 +020014926#ifdef HAVE_MBCS
14927 winver.dwOSVersionInfoSize = sizeof(winver);
14928 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14929 PyErr_SetFromWindowsErr(0);
14930 return -1;
14931 }
14932#endif
14933 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014934}
14935
/* Finalize the Unicode implementation */

/* Does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14943
Guido van Rossumd57fd912000-03-10 22:53:23 +000014944void
Thomas Wouters78890102000-07-22 19:25:51 +000014945_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014947 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014948
Serhiy Storchaka05997252013-01-26 12:14:02 +020014949 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014950
Serhiy Storchaka05997252013-01-26 12:14:02 +020014951 for (i = 0; i < 256; i++)
14952 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014953 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014954 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014955}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014956
Walter Dörwald16807132007-05-25 13:52:07 +000014957void
14958PyUnicode_InternInPlace(PyObject **p)
14959{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014960 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014961 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014962#ifdef Py_DEBUG
14963 assert(s != NULL);
14964 assert(_PyUnicode_CHECK(s));
14965#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014966 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014967 return;
14968#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014969 /* If it's a subclass, we don't really know what putting
14970 it in the interned dict might do. */
14971 if (!PyUnicode_CheckExact(s))
14972 return;
14973 if (PyUnicode_CHECK_INTERNED(s))
14974 return;
14975 if (interned == NULL) {
14976 interned = PyDict_New();
14977 if (interned == NULL) {
14978 PyErr_Clear(); /* Don't leave an exception */
14979 return;
14980 }
14981 }
14982 /* It might be that the GetItem call fails even
14983 though the key is present in the dictionary,
14984 namely when this happens during a stack overflow. */
14985 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014986 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014987 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014988
Victor Stinnerf0335102013-04-14 19:13:03 +020014989 if (t) {
14990 Py_INCREF(t);
14991 Py_DECREF(*p);
14992 *p = t;
14993 return;
14994 }
Walter Dörwald16807132007-05-25 13:52:07 +000014995
Benjamin Peterson14339b62009-01-31 16:36:08 +000014996 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014997 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014998 PyErr_Clear();
14999 PyThreadState_GET()->recursion_critical = 0;
15000 return;
15001 }
15002 PyThreadState_GET()->recursion_critical = 0;
15003 /* The two references in interned are not counted by refcnt.
15004 The deallocator will take care of this */
15005 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015006 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015007}
15008
15009void
15010PyUnicode_InternImmortal(PyObject **p)
15011{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015012 PyUnicode_InternInPlace(p);
15013 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015014 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015015 Py_INCREF(*p);
15016 }
Walter Dörwald16807132007-05-25 13:52:07 +000015017}
15018
15019PyObject *
15020PyUnicode_InternFromString(const char *cp)
15021{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015022 PyObject *s = PyUnicode_FromString(cp);
15023 if (s == NULL)
15024 return NULL;
15025 PyUnicode_InternInPlace(&s);
15026 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015027}
15028
Alexander Belopolsky40018472011-02-26 01:02:56 +000015029void
15030_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015031{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015032 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015033 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015034 Py_ssize_t i, n;
15035 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015036
Benjamin Peterson14339b62009-01-31 16:36:08 +000015037 if (interned == NULL || !PyDict_Check(interned))
15038 return;
15039 keys = PyDict_Keys(interned);
15040 if (keys == NULL || !PyList_Check(keys)) {
15041 PyErr_Clear();
15042 return;
15043 }
Walter Dörwald16807132007-05-25 13:52:07 +000015044
Benjamin Peterson14339b62009-01-31 16:36:08 +000015045 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15046 detector, interned unicode strings are not forcibly deallocated;
15047 rather, we give them their stolen references back, and then clear
15048 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015049
Benjamin Peterson14339b62009-01-31 16:36:08 +000015050 n = PyList_GET_SIZE(keys);
15051 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015052 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015053 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015054 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015055 if (PyUnicode_READY(s) == -1) {
15056 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015057 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015059 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 case SSTATE_NOT_INTERNED:
15061 /* XXX Shouldn't happen */
15062 break;
15063 case SSTATE_INTERNED_IMMORTAL:
15064 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015065 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015066 break;
15067 case SSTATE_INTERNED_MORTAL:
15068 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015069 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015070 break;
15071 default:
15072 Py_FatalError("Inconsistent interned string state.");
15073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015074 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015075 }
15076 fprintf(stderr, "total size of all interned strings: "
15077 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15078 "mortal/immortal\n", mortal_size, immortal_size);
15079 Py_DECREF(keys);
15080 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015081 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015082}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015083
15084
15085/********************* Unicode Iterator **************************/
15086
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15092
/* Destructor: untrack the iterator from the GC, drop the reference to the
   underlying string (if any) and free the object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15100
/* GC traversal: report the underlying string to the visitor. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15107
15108static PyObject *
15109unicodeiter_next(unicodeiterobject *it)
15110{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015111 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015112
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 assert(it != NULL);
15114 seq = it->it_seq;
15115 if (seq == NULL)
15116 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015117 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015119 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15120 int kind = PyUnicode_KIND(seq);
15121 void *data = PyUnicode_DATA(seq);
15122 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15123 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015124 if (item != NULL)
15125 ++it->it_index;
15126 return item;
15127 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015128
Benjamin Peterson14339b62009-01-31 16:36:08 +000015129 Py_DECREF(seq);
15130 it->it_seq = NULL;
15131 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015132}
15133
15134static PyObject *
15135unicodeiter_len(unicodeiterobject *it)
15136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015137 Py_ssize_t len = 0;
15138 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015139 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015140 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015141}
15142
15143PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15144
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015145static PyObject *
15146unicodeiter_reduce(unicodeiterobject *it)
15147{
15148 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015149 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015150 it->it_seq, it->it_index);
15151 } else {
15152 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15153 if (u == NULL)
15154 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015155 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015156 }
15157}
15158
15159PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15160
15161static PyObject *
15162unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15163{
15164 Py_ssize_t index = PyLong_AsSsize_t(state);
15165 if (index == -1 && PyErr_Occurred())
15166 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015167 if (it->it_seq != NULL) {
15168 if (index < 0)
15169 index = 0;
15170 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15171 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15172 it->it_index = index;
15173 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015174 Py_RETURN_NONE;
15175}
15176
15177PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15178
/* Method table of the str iterator: length hint plus pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
15188
/* Type object of the str iterator ("str_iterator").  The initializer is
   positional, so each entry is labelled with the slot it fills. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
15221
15222static PyObject *
15223unicode_iter(PyObject *seq)
15224{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015225 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015226
Benjamin Peterson14339b62009-01-31 16:36:08 +000015227 if (!PyUnicode_Check(seq)) {
15228 PyErr_BadInternalCall();
15229 return NULL;
15230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015231 if (PyUnicode_READY(seq) == -1)
15232 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015233 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15234 if (it == NULL)
15235 return NULL;
15236 it->it_index = 0;
15237 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015238 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015239 _PyObject_GC_TRACK(it);
15240 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015241}
15242
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015243
15244size_t
15245Py_UNICODE_strlen(const Py_UNICODE *u)
15246{
15247 int res = 0;
15248 while(*u++)
15249 res++;
15250 return res;
15251}
15252
15253Py_UNICODE*
15254Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15255{
15256 Py_UNICODE *u = s1;
15257 while ((*u++ = *s2++));
15258 return s1;
15259}
15260
15261Py_UNICODE*
15262Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15263{
15264 Py_UNICODE *u = s1;
15265 while ((*u++ = *s2++))
15266 if (n-- == 0)
15267 break;
15268 return s1;
15269}
15270
15271Py_UNICODE*
15272Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15273{
15274 Py_UNICODE *u1 = s1;
15275 u1 += Py_UNICODE_strlen(u1);
15276 Py_UNICODE_strcpy(u1, s2);
15277 return s1;
15278}
15279
15280int
15281Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15282{
15283 while (*s1 && *s2 && *s1 == *s2)
15284 s1++, s2++;
15285 if (*s1 && *s2)
15286 return (*s1 < *s2) ? -1 : +1;
15287 if (*s1)
15288 return 1;
15289 if (*s2)
15290 return -1;
15291 return 0;
15292}
15293
15294int
15295Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15296{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015297 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015298 for (; n != 0; n--) {
15299 u1 = *s1;
15300 u2 = *s2;
15301 if (u1 != u2)
15302 return (u1 < u2) ? -1 : +1;
15303 if (u1 == '\0')
15304 return 0;
15305 s1++;
15306 s2++;
15307 }
15308 return 0;
15309}
15310
15311Py_UNICODE*
15312Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15313{
15314 const Py_UNICODE *p;
15315 for (p = s; *p; p++)
15316 if (*p == c)
15317 return (Py_UNICODE*)p;
15318 return NULL;
15319}
15320
15321Py_UNICODE*
15322Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15323{
15324 const Py_UNICODE *p;
15325 p = s + Py_UNICODE_strlen(s);
15326 while (p != s) {
15327 p--;
15328 if (*p == c)
15329 return (Py_UNICODE*)p;
15330 }
15331 return NULL;
15332}
Victor Stinner331ea922010-08-10 16:37:20 +000015333
Victor Stinner71133ff2010-09-01 23:43:53 +000015334Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015335PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015336{
Victor Stinner577db2c2011-10-11 22:12:48 +020015337 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015338 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015340 if (!PyUnicode_Check(unicode)) {
15341 PyErr_BadArgument();
15342 return NULL;
15343 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015344 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015345 if (u == NULL)
15346 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015347 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015348 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015349 PyErr_NoMemory();
15350 return NULL;
15351 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015352 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015353 size *= sizeof(Py_UNICODE);
15354 copy = PyMem_Malloc(size);
15355 if (copy == NULL) {
15356 PyErr_NoMemory();
15357 return NULL;
15358 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015359 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015360 return copy;
15361}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015362
Georg Brandl66c221e2010-10-14 07:04:07 +000015363/* A _string module, to export formatter_parser and formatter_field_name_split
15364 to the string.Formatter class implemented in Python. */
15365
15366static PyMethodDef _string_methods[] = {
15367 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15368 METH_O, PyDoc_STR("split the argument as a field name")},
15369 {"formatter_parser", (PyCFunction) formatter_parser,
15370 METH_O, PyDoc_STR("parse the argument as a format string")},
15371 {NULL, NULL}
15372};
15373
15374static struct PyModuleDef _string_module = {
15375 PyModuleDef_HEAD_INIT,
15376 "_string",
15377 PyDoc_STR("string helper module"),
15378 0,
15379 _string_methods,
15380 NULL,
15381 NULL,
15382 NULL,
15383 NULL
15384};
15385
15386PyMODINIT_FUNC
15387PyInit__string(void)
15388{
15389 return PyModule_Create(&_string_module);
15390}
15391
15392
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015393#ifdef __cplusplus
15394}
15395#endif