blob: 0386a871253696a28a9048d29a5249d116ad8a3c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Largest code point accepted by this file: U+10FFFF (1,114,111), the
   maximum code point of Unicode 6.0. */
#define MAX_UNICODE 0x10ffff

/* Object sanity check used inside assert() by the accessor macros below.
   Debug builds run _PyUnicode_CheckConsistency() without the content scan
   (second argument 0); other builds only call PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Field accessors.  Macros whose expansion is a plain member access read the
   struct field directly; the ones containing assert() validate the object
   with _PyUnicode_CHECK() (and, where present, PyUnicode_IS_READY()) first. */

/* utf8 member of the compact header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 data of a ready string: for a compact ASCII object this is the
   memory located right after the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of the compact header. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the string length itself for a compact
   ASCII object, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr member and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the base header (no checks). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, read after the _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any member of the full PyUnicodeObject layout. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY(): asserts on the object via
   _PyUnicode_CHECK(), then evaluates to 0 if the string is already ready and
   to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op)                            \
          ? 0                                           \
          : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of the object is the character data itself, i.e.
   no separate UTF-8 buffer exists.  Must not be used on compact ASCII
   objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the object is the character data itself, i.e.
   no separate wchar_t buffer exists.
   Every use of the argument goes through the macro parameter `op`; the macro
   must not depend on the caller having a variable called `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the object has its own UTF-8 buffer: it is not compact ASCII, its
   utf8 pointer is set, and that pointer is not the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the object has its own wstr buffer: its wstr pointer is set and it
   is not the case that the string is ready with wstr pointing at the
   character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && !(PyUnicode_IS_READY(op)                        \
          && _PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
140
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers delimiting the source characters, [begin, end), and should be of
   type "from_type *" (const-qualified pointers are accepted: the source is
   only read).  to is a pointer of type "to_type *" and points to the buffer
   where the result characters are written; it must have room for
   (end - begin) items.

   Each character is converted with a plain cast, four at a time while at
   least four remain, then one by one.  All macro locals carry a leading
   underscore so that they do not shadow variables of the caller. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_to = (to_type *)(to);                         \
        const from_type *_iter = (const from_type *)(begin);    \
        const from_type *_end = (const from_type *)(end);       \
        Py_ssize_t _n = (_end) - (_iter);                       \
        const from_type *_unrolled_end =                        \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                 \
        while (_iter < (_unrolled_end)) {                       \
            _to[0] = (to_type) _iter[0];                        \
            _to[1] = (to_type) _iter[1];                        \
            _to[2] = (to_type) _iter[2];                        \
            _to[3] = (to_type) _iter[3];                        \
            _iter += 4; _to += 4;                               \
        }                                                       \
        while (_iter < (_end))                                  \
            *_to++ = (to_type) *_iter++;                        \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  When it has to be created, the object
   ends up with one reference owned by the unicode_empty global plus the one
   taken here.  If creation fails, unicode_empty stays NULL and no reference
   is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else                                            \
            Py_INCREF(unicode_empty);                   \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (the returned value is NULL if the string could not be
   created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c is 1 if the ASCII character c (0..127) is whitespace,
   0 otherwise.  Each row below covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: CHARACTER TABULATION
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Lookup table for fast detection of line breaks: entry c is 1 if the ASCII
   character c (0..127) is a line break, 0 otherwise.  Each row below covers
   eight consecutive code points.

   The table is only ever read (see BLOOM_LINEBREAK), so it is declared
   const, like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build only: assert that the internal fields of the Unicode object
   `op` are mutually consistent.

   op            -- object to check; must satisfy PyUnicode_Check().
   check_content -- if non-zero (and the string is not in the wchar kind),
                    also scan the characters to verify that the narrowest
                    possible kind is used and that the data is followed by a
                    0 character.

   Every violation trips an assert(); the function itself always returns 1,
   so that it can be called from inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters are stored right after
               the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            /* non-ASCII data can never double as the UTF-8 buffer */
            assert (compact->utf8 != data);
        }
        else {
            /* not compact: the characters live in a separate block
               referenced by data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is set, all the other
                   fields must still hold their initial values */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: the data is shared as the UTF-8 buffer */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an unset cache pointer implies a zero cached length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* find the largest character stored in the string */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest character must fall in the range that requires
           exactly this kind (and this ascii flag) */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the item following the last character must be 0 */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
#ifdef HAVE_MBCS
/* Windows version information (OSVERSIONINFOEX), only compiled in when
   HAVE_MBCS is defined.  NOTE(review): it is not filled in or read within
   this part of the file -- see its users for details. */
static OSVERSIONINFOEX winver;
#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
/* Test whether the bit selected by the low bits of `ch` is set in `mask`.
   Evaluates to zero if the character is definitely not in the set the mask
   was built from, and to a non-zero value if it may be in it.
   Both arguments are parenthesised, so `mask` can be any expression
   (e.g. BLOOM(a | b, ch)). */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if `ch` is a line break: a table lookup for characters below
   128, otherwise the bloom filter followed by the exact
   Py_UNICODE_ISLINEBREAK() test.  `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length) of
   `unicode` with 0xff bytes so that reads of not-yet-initialized characters
   are detected early.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length < new_length) {
        /* offsets and sizes are in bytes, hence the scaling by char_size */
        Py_ssize_t added = new_length - old_length;
        memset(base + old_length * char_size, 0xff, added * char_size);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Function wrapper around the PyUnicode_UTF8() macro, callable from a
   debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Function wrapper around the _PyUnicode_COMPACT_DATA() macro, callable from
   a debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001014 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1015
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 if (ascii->wstr == data)
1017 printf("shared ");
1018 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001019
Victor Stinnera3b334d2011-10-03 13:53:37 +02001020 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(" (%zu), ", compact->wstr_length);
1022 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1023 printf("shared ");
1024 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  A
   high surrogate immediately followed by a low surrogate is decoded into a
   single code point; any other unit (including a lone surrogate) is copied
   as is.  The other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* the output must never run past the declared length */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* exactly the declared number of code points was produced */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
1373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
1375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
/* Convert a legacy, not-yet-ready string (one that only has a wchar_t
   buffer) into its canonical representation.

   The wchar_t buffer is scanned for its largest code point, and the
   narrowest of the 1-, 2- or 4-byte kinds that can hold it is chosen.
   Depending on sizeof(wchar_t), the character data is either converted into
   a newly allocated buffer (after which the wchar_t buffer is freed) or the
   wchar_t buffer itself is reused as the data buffer.

   Return 0 on success.  Return -1 with an exception set if the buffer holds
   a code point above MAX_UNICODE (as reported by find_maxchar_surrogates())
   or if a buffer allocation fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One past the last wchar_t unit of the legacy buffer. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte kind: one byte per character plus a trailing NUL. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 buffer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wchar_t buffer is no longer needed once converted. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Two bytes per character plus a trailing NUL character. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: reuse its buffer as the data
           buffer instead of copying. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Terminate the 4-byte data in both configurations. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1566
Alexander Belopolsky40018472011-02-26 01:02:56 +00001567static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001568unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569{
Walter Dörwald16807132007-05-25 13:52:07 +00001570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 case SSTATE_NOT_INTERNED:
1572 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 case SSTATE_INTERNED_MORTAL:
1575 /* revive dead object temporarily for DelItem */
1576 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001577 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 Py_FatalError(
1579 "deletion of interned string failed");
1580 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_INTERNED_IMMORTAL:
1583 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 default:
1586 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001587 }
1588
Victor Stinner03490912011-10-03 23:45:12 +02001589 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001591 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001592 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001593 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1594 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001596 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597}
1598
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects
   kept by this file -- the empty string, or the cached one-character string
   for a code point below 256 -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready, one-character string can be a latin1 cache entry. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1615
Alexander Belopolsky40018472011-02-26 01:02:56 +00001616static int
Victor Stinner488fa492011-12-12 00:01:39 +01001617unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618{
Victor Stinner488fa492011-12-12 00:01:39 +01001619 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (Py_REFCNT(unicode) != 1)
1621 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001622 if (_PyUnicode_HASH(unicode) != -1)
1623 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (PyUnicode_CHECK_INTERNED(unicode))
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (!PyUnicode_CheckExact(unicode))
1627 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001628#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001629 /* singleton refcount is greater than 1 */
1630 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001631#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001632 return 1;
1633}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635static int
1636unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1637{
1638 PyObject *unicode;
1639 Py_ssize_t old_length;
1640
1641 assert(p_unicode != NULL);
1642 unicode = *p_unicode;
1643
1644 assert(unicode != NULL);
1645 assert(PyUnicode_Check(unicode));
1646 assert(0 <= length);
1647
Victor Stinner910337b2011-10-03 03:20:16 +02001648 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001649 old_length = PyUnicode_WSTR_LENGTH(unicode);
1650 else
1651 old_length = PyUnicode_GET_LENGTH(unicode);
1652 if (old_length == length)
1653 return 0;
1654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001656 _Py_INCREF_UNICODE_EMPTY();
1657 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001658 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 Py_DECREF(*p_unicode);
1660 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 return 0;
1662 }
1663
Victor Stinner488fa492011-12-12 00:01:39 +01001664 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 PyObject *copy = resize_copy(unicode, length);
1666 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001667 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 Py_DECREF(*p_unicode);
1669 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
1672
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001674 PyObject *new_unicode = resize_compact(unicode, length);
1675 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001677 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001679 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001680 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001681}
1682
Alexander Belopolsky40018472011-02-26 01:02:56 +00001683int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001684PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001685{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686 PyObject *unicode;
1687 if (p_unicode == NULL) {
1688 PyErr_BadInternalCall();
1689 return -1;
1690 }
1691 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693 {
1694 PyErr_BadInternalCall();
1695 return -1;
1696 }
1697 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001698}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001699
Victor Stinnerc5166102012-02-22 13:55:02 +01001700/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001701
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001702 WARNING: The function doesn't copy the terminating null character and
1703 doesn't check the maximum character (may write a latin1 character in an
1704 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001705static void
1706unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1707 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001708{
1709 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1710 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001711 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001712
1713 switch (kind) {
1714 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001715 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001716#ifdef Py_DEBUG
1717 if (PyUnicode_IS_ASCII(unicode)) {
1718 Py_UCS4 maxchar = ucs1lib_find_max_char(
1719 (const Py_UCS1*)str,
1720 (const Py_UCS1*)str + len);
1721 assert(maxchar < 128);
1722 }
1723#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001724 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001725 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001726 }
1727 case PyUnicode_2BYTE_KIND: {
1728 Py_UCS2 *start = (Py_UCS2 *)data + index;
1729 Py_UCS2 *ucs2 = start;
1730 assert(index <= PyUnicode_GET_LENGTH(unicode));
1731
Victor Stinner184252a2012-06-16 02:57:41 +02001732 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 *ucs2 = (Py_UCS2)*str;
1734
1735 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001736 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 }
1738 default: {
1739 Py_UCS4 *start = (Py_UCS4 *)data + index;
1740 Py_UCS4 *ucs4 = start;
1741 assert(kind == PyUnicode_4BYTE_KIND);
1742 assert(index <= PyUnicode_GET_LENGTH(unicode));
1743
Victor Stinner184252a2012-06-16 02:57:41 +02001744 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001745 *ucs4 = (Py_UCS4)*str;
1746
1747 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001748 }
1749 }
1750}
1751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752static PyObject*
1753get_latin1_char(unsigned char ch)
1754{
Victor Stinnera464fc12011-10-02 20:39:30 +02001755 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode)
1759 return NULL;
1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 unicode_latin1[ch] = unicode;
1763 }
1764 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001765 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Victor Stinner985a82a2014-01-03 12:53:47 +01001768static PyObject*
1769unicode_char(Py_UCS4 ch)
1770{
1771 PyObject *unicode;
1772
1773 assert(ch <= MAX_UNICODE);
1774
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001775 if (ch < 256)
1776 return get_latin1_char(ch);
1777
Victor Stinner985a82a2014-01-03 12:53:47 +01001778 unicode = PyUnicode_New(1, ch);
1779 if (unicode == NULL)
1780 return NULL;
1781 switch (PyUnicode_KIND(unicode)) {
1782 case PyUnicode_1BYTE_KIND:
1783 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1784 break;
1785 case PyUnicode_2BYTE_KIND:
1786 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1787 break;
1788 default:
1789 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1790 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1791 }
1792 assert(_PyUnicode_CheckConsistency(unicode, 1));
1793 return unicode;
1794}
1795
Alexander Belopolsky40018472011-02-26 01:02:56 +00001796PyObject *
1797PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001799 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 Py_UCS4 maxchar = 0;
1801 Py_ssize_t num_surrogates;
1802
1803 if (u == NULL)
1804 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001806 /* If the Unicode data is known at construction time, we can apply
1807 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001810 if (size == 0)
1811 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Single character Unicode objects in the Latin-1 range are
1814 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001815 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 return get_latin1_char((unsigned char)*u);
1817
1818 /* If not empty and not single character, copy the Unicode data
1819 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 if (find_maxchar_surrogates(u, u + size,
1821 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 return NULL;
1823
Victor Stinner8faf8212011-12-08 22:14:11 +01001824 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 if (!unicode)
1826 return NULL;
1827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 switch (PyUnicode_KIND(unicode)) {
1829 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001830 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1832 break;
1833 case PyUnicode_2BYTE_KIND:
1834#if Py_UNICODE_SIZE == 2
1835 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1836#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001837 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1839#endif
1840 break;
1841 case PyUnicode_4BYTE_KIND:
1842#if SIZEOF_WCHAR_T == 2
1843 /* This is the only case which has to process surrogates, thus
1844 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001845 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846#else
1847 assert(num_surrogates == 0);
1848 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1849#endif
1850 break;
1851 default:
1852 assert(0 && "Impossible state");
1853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001855 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856}
1857
Alexander Belopolsky40018472011-02-26 01:02:56 +00001858PyObject *
1859PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001860{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001861 if (size < 0) {
1862 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001864 return NULL;
1865 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001866 if (u != NULL)
1867 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1868 else
1869 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001870}
1871
Alexander Belopolsky40018472011-02-26 01:02:56 +00001872PyObject *
1873PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001874{
1875 size_t size = strlen(u);
1876 if (size > PY_SSIZE_T_MAX) {
1877 PyErr_SetString(PyExc_OverflowError, "input too long");
1878 return NULL;
1879 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001880 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001881}
1882
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001883PyObject *
1884_PyUnicode_FromId(_Py_Identifier *id)
1885{
1886 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001887 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1888 strlen(id->string),
1889 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001890 if (!id->object)
1891 return NULL;
1892 PyUnicode_InternInPlace(&id->object);
1893 assert(!id->next);
1894 id->next = static_strings;
1895 static_strings = id;
1896 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001897 return id->object;
1898}
1899
1900void
1901_PyUnicode_ClearStaticStrings()
1902{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001903 _Py_Identifier *tmp, *s = static_strings;
1904 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001905 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001906 tmp = s->next;
1907 s->next = NULL;
1908 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001909 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001911}
1912
Benjamin Peterson0df54292012-03-26 14:50:32 -04001913/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914
Victor Stinnerd3f08822012-05-29 12:57:52 +02001915PyObject*
1916_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001917{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001918 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001919 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001920 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001921#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001922 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001923#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001924 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001925 }
Victor Stinner785938e2011-12-11 20:09:03 +01001926 unicode = PyUnicode_New(size, 127);
1927 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001928 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001929 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1930 assert(_PyUnicode_CheckConsistency(unicode, 1));
1931 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001932}
1933
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001934static Py_UCS4
1935kind_maxchar_limit(unsigned int kind)
1936{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001937 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001938 case PyUnicode_1BYTE_KIND:
1939 return 0x80;
1940 case PyUnicode_2BYTE_KIND:
1941 return 0x100;
1942 case PyUnicode_4BYTE_KIND:
1943 return 0x10000;
1944 default:
1945 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001946 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001947 }
1948}
1949
Victor Stinnere6abb482012-05-02 01:15:40 +02001950Py_LOCAL_INLINE(Py_UCS4)
1951align_maxchar(Py_UCS4 maxchar)
1952{
1953 if (maxchar <= 127)
1954 return 127;
1955 else if (maxchar <= 255)
1956 return 255;
1957 else if (maxchar <= 65535)
1958 return 65535;
1959 else
1960 return MAX_UNICODE;
1961}
1962
Victor Stinner702c7342011-10-05 13:50:52 +02001963static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001964_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001967 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001968
Serhiy Storchaka678db842013-01-26 12:16:36 +02001969 if (size == 0)
1970 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001972 if (size == 1)
1973 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001975 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001976 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 if (!res)
1978 return NULL;
1979 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001982}
1983
Victor Stinnere57b1c02011-09-28 22:20:48 +02001984static PyObject*
1985_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986{
1987 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001988 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989
Serhiy Storchaka678db842013-01-26 12:16:36 +02001990 if (size == 0)
1991 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001992 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001993 if (size == 1)
1994 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001995
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001996 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001997 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 if (!res)
1999 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002000 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002002 else {
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2005 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002006 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return res;
2008}
2009
Victor Stinnere57b1c02011-09-28 22:20:48 +02002010static PyObject*
2011_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012{
2013 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002014 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002015
Serhiy Storchaka678db842013-01-26 12:16:36 +02002016 if (size == 0)
2017 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002019 if (size == 1)
2020 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002021
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002022 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002023 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 if (!res)
2025 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002026 if (max_char < 256)
2027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2028 PyUnicode_1BYTE_DATA(res));
2029 else if (max_char < 0x10000)
2030 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2031 PyUnicode_2BYTE_DATA(res));
2032 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002034 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return res;
2036}
2037
2038PyObject*
2039PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2040{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002041 if (size < 0) {
2042 PyErr_SetString(PyExc_ValueError, "size must be positive");
2043 return NULL;
2044 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002045 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002047 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002052 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002053 PyErr_SetString(PyExc_SystemError, "invalid kind");
2054 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056}
2057
Victor Stinnerece58de2012-04-23 23:36:38 +02002058Py_UCS4
2059_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2060{
2061 enum PyUnicode_Kind kind;
2062 void *startptr, *endptr;
2063
2064 assert(PyUnicode_IS_READY(unicode));
2065 assert(0 <= start);
2066 assert(end <= PyUnicode_GET_LENGTH(unicode));
2067 assert(start <= end);
2068
2069 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2070 return PyUnicode_MAX_CHAR_VALUE(unicode);
2071
2072 if (start == end)
2073 return 127;
2074
Victor Stinner94d558b2012-04-27 22:26:58 +02002075 if (PyUnicode_IS_ASCII(unicode))
2076 return 127;
2077
Victor Stinnerece58de2012-04-23 23:36:38 +02002078 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002079 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002080 endptr = (char *)startptr + end * kind;
2081 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002082 switch(kind) {
2083 case PyUnicode_1BYTE_KIND:
2084 return ucs1lib_find_max_char(startptr, endptr);
2085 case PyUnicode_2BYTE_KIND:
2086 return ucs2lib_find_max_char(startptr, endptr);
2087 case PyUnicode_4BYTE_KIND:
2088 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002090 assert(0);
2091 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002092 }
2093}
2094
Victor Stinner25a4b292011-10-06 12:31:55 +02002095/* Ensure that a string uses the most efficient storage, if it is not the
2096 case: create a new string with of the right kind. Write NULL into *p_unicode
2097 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002098static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002099unicode_adjust_maxchar(PyObject **p_unicode)
2100{
2101 PyObject *unicode, *copy;
2102 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002103 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002104 unsigned int kind;
2105
2106 assert(p_unicode != NULL);
2107 unicode = *p_unicode;
2108 assert(PyUnicode_IS_READY(unicode));
2109 if (PyUnicode_IS_ASCII(unicode))
2110 return;
2111
2112 len = PyUnicode_GET_LENGTH(unicode);
2113 kind = PyUnicode_KIND(unicode);
2114 if (kind == PyUnicode_1BYTE_KIND) {
2115 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002116 max_char = ucs1lib_find_max_char(u, u + len);
2117 if (max_char >= 128)
2118 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 }
2120 else if (kind == PyUnicode_2BYTE_KIND) {
2121 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002122 max_char = ucs2lib_find_max_char(u, u + len);
2123 if (max_char >= 256)
2124 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002125 }
2126 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002127 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002128 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 max_char = ucs4lib_find_max_char(u, u + len);
2130 if (max_char >= 0x10000)
2131 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002134 if (copy != NULL)
2135 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 Py_DECREF(unicode);
2137 *p_unicode = copy;
2138}
2139
Victor Stinner034f6cf2011-09-30 02:26:44 +02002140PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002141_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142{
Victor Stinner87af4f22011-11-21 23:03:47 +01002143 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002144 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145
Victor Stinner034f6cf2011-09-30 02:26:44 +02002146 if (!PyUnicode_Check(unicode)) {
2147 PyErr_BadInternalCall();
2148 return NULL;
2149 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002150 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002151 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152
Victor Stinner87af4f22011-11-21 23:03:47 +01002153 length = PyUnicode_GET_LENGTH(unicode);
2154 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002155 if (!copy)
2156 return NULL;
2157 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2158
Victor Stinner87af4f22011-11-21 23:03:47 +01002159 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2160 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002161 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002162 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002163}
2164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165
Victor Stinnerbc603d12011-10-02 01:00:40 +02002166/* Widen Unicode objects to larger buffers. Don't write terminating null
2167 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168
2169void*
2170_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2171{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 Py_ssize_t len;
2173 void *result;
2174 unsigned int skind;
2175
Benjamin Petersonbac79492012-01-14 13:34:47 -05002176 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002177 return NULL;
2178
2179 len = PyUnicode_GET_LENGTH(s);
2180 skind = PyUnicode_KIND(s);
2181 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002182 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 return NULL;
2184 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002185 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 case PyUnicode_2BYTE_KIND:
2187 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2188 if (!result)
2189 return PyErr_NoMemory();
2190 assert(skind == PyUnicode_1BYTE_KIND);
2191 _PyUnicode_CONVERT_BYTES(
2192 Py_UCS1, Py_UCS2,
2193 PyUnicode_1BYTE_DATA(s),
2194 PyUnicode_1BYTE_DATA(s) + len,
2195 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002197 case PyUnicode_4BYTE_KIND:
2198 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2199 if (!result)
2200 return PyErr_NoMemory();
2201 if (skind == PyUnicode_2BYTE_KIND) {
2202 _PyUnicode_CONVERT_BYTES(
2203 Py_UCS2, Py_UCS4,
2204 PyUnicode_2BYTE_DATA(s),
2205 PyUnicode_2BYTE_DATA(s) + len,
2206 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002208 else {
2209 assert(skind == PyUnicode_1BYTE_KIND);
2210 _PyUnicode_CONVERT_BYTES(
2211 Py_UCS1, Py_UCS4,
2212 PyUnicode_1BYTE_DATA(s),
2213 PyUnicode_1BYTE_DATA(s) + len,
2214 result);
2215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002217 default:
2218 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 }
Victor Stinner01698042011-10-04 00:04:26 +02002220 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 return NULL;
2222}
2223
2224static Py_UCS4*
2225as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2226 int copy_null)
2227{
2228 int kind;
2229 void *data;
2230 Py_ssize_t len, targetlen;
2231 if (PyUnicode_READY(string) == -1)
2232 return NULL;
2233 kind = PyUnicode_KIND(string);
2234 data = PyUnicode_DATA(string);
2235 len = PyUnicode_GET_LENGTH(string);
2236 targetlen = len;
2237 if (copy_null)
2238 targetlen++;
2239 if (!target) {
2240 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2241 PyErr_NoMemory();
2242 return NULL;
2243 }
2244 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2245 if (!target) {
2246 PyErr_NoMemory();
2247 return NULL;
2248 }
2249 }
2250 else {
2251 if (targetsize < targetlen) {
2252 PyErr_Format(PyExc_SystemError,
2253 "string is longer than the buffer");
2254 if (copy_null && 0 < targetsize)
2255 target[0] = 0;
2256 return NULL;
2257 }
2258 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002259 if (kind == PyUnicode_1BYTE_KIND) {
2260 Py_UCS1 *start = (Py_UCS1 *) data;
2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002263 else if (kind == PyUnicode_2BYTE_KIND) {
2264 Py_UCS2 *start = (Py_UCS2 *) data;
2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266 }
2267 else {
2268 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 if (copy_null)
2272 target[len] = 0;
2273 return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278 int copy_null)
2279{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002280 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 PyErr_BadInternalCall();
2282 return NULL;
2283 }
2284 return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
2287Py_UCS4*
2288PyUnicode_AsUCS4Copy(PyObject *string)
2289{
2290 return as_ucs4(string, NULL, 0, 1);
2291}
2292
2293#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002294
Alexander Belopolsky40018472011-02-26 01:02:56 +00002295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002300 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 PyErr_BadInternalCall();
2302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 }
2304
Martin v. Löwis790465f2008-04-05 20:41:37 +00002305 if (size == -1) {
2306 size = wcslen(w);
2307 }
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002313
Walter Dörwald346737f2007-05-31 10:44:43 +00002314static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002315makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002316 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002317{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002318 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 if (longflag)
2320 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002321 else if (longlongflag) {
2322 /* longlongflag should only ever be nonzero on machines with
2323 HAVE_LONG_LONG defined */
2324#ifdef HAVE_LONG_LONG
2325 char *f = PY_FORMAT_LONG_LONG;
2326 while (*f)
2327 *fmt++ = *f++;
2328#else
2329 /* we shouldn't ever get here */
2330 assert(0);
2331 *fmt++ = 'l';
2332#endif
2333 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002334 else if (size_tflag) {
2335 char *f = PY_FORMAT_SIZE_T;
2336 while (*f)
2337 *fmt++ = *f++;
2338 }
2339 *fmt++ = c;
2340 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002341}
2342
Victor Stinner15a11362012-10-06 23:48:20 +02002343/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002344 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2345 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2346#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002347
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002348static int
2349unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2350 Py_ssize_t width, Py_ssize_t precision)
2351{
2352 Py_ssize_t length, fill, arglen;
2353 Py_UCS4 maxchar;
2354
2355 if (PyUnicode_READY(str) == -1)
2356 return -1;
2357
2358 length = PyUnicode_GET_LENGTH(str);
2359 if ((precision == -1 || precision >= length)
2360 && width <= length)
2361 return _PyUnicodeWriter_WriteStr(writer, str);
2362
2363 if (precision != -1)
2364 length = Py_MIN(precision, length);
2365
2366 arglen = Py_MAX(length, width);
2367 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2368 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2369 else
2370 maxchar = writer->maxchar;
2371
2372 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2373 return -1;
2374
2375 if (width > length) {
2376 fill = width - length;
2377 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2378 return -1;
2379 writer->pos += fill;
2380 }
2381
2382 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2383 str, 0, length);
2384 writer->pos += length;
2385 return 0;
2386}
2387
2388static int
2389unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2390 Py_ssize_t width, Py_ssize_t precision)
2391{
2392 /* UTF-8 */
2393 Py_ssize_t length;
2394 PyObject *unicode;
2395 int res;
2396
2397 length = strlen(str);
2398 if (precision != -1)
2399 length = Py_MIN(length, precision);
2400 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2401 if (unicode == NULL)
2402 return -1;
2403
2404 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2405 Py_DECREF(unicode);
2406 return res;
2407}
2408
Victor Stinner96865452011-03-01 23:44:09 +00002409static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002410unicode_fromformat_arg(_PyUnicodeWriter *writer,
2411 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002412{
Victor Stinnere215d962012-10-06 23:03:36 +02002413 const char *p;
2414 Py_ssize_t len;
2415 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002416 Py_ssize_t width;
2417 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002418 int longflag;
2419 int longlongflag;
2420 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002421 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002422
2423 p = f;
2424 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002425 zeropad = 0;
2426 if (*f == '0') {
2427 zeropad = 1;
2428 f++;
2429 }
Victor Stinner96865452011-03-01 23:44:09 +00002430
2431 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002432 width = -1;
2433 if (Py_ISDIGIT((unsigned)*f)) {
2434 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002435 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002436 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002437 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002438 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002440 return NULL;
2441 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002442 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002443 f++;
2444 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002445 }
2446 precision = -1;
2447 if (*f == '.') {
2448 f++;
2449 if (Py_ISDIGIT((unsigned)*f)) {
2450 precision = (*f - '0');
2451 f++;
2452 while (Py_ISDIGIT((unsigned)*f)) {
2453 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2454 PyErr_SetString(PyExc_ValueError,
2455 "precision too big");
2456 return NULL;
2457 }
2458 precision = (precision * 10) + (*f - '0');
2459 f++;
2460 }
2461 }
Victor Stinner96865452011-03-01 23:44:09 +00002462 if (*f == '%') {
2463 /* "%.3%s" => f points to "3" */
2464 f--;
2465 }
2466 }
2467 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002468 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002469 f--;
2470 }
Victor Stinner96865452011-03-01 23:44:09 +00002471
2472 /* Handle %ld, %lu, %lld and %llu. */
2473 longflag = 0;
2474 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002475 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002476 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002477 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002478 longflag = 1;
2479 ++f;
2480 }
2481#ifdef HAVE_LONG_LONG
2482 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002483 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002484 longlongflag = 1;
2485 f += 2;
2486 }
2487#endif
2488 }
2489 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002490 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002491 size_tflag = 1;
2492 ++f;
2493 }
Victor Stinnere215d962012-10-06 23:03:36 +02002494
2495 if (f[1] == '\0')
2496 writer->overallocate = 0;
2497
2498 switch (*f) {
2499 case 'c':
2500 {
2501 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002502 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002503 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002504 "character argument not in range(0x110000)");
2505 return NULL;
2506 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002507 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002508 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002509 break;
2510 }
2511
2512 case 'i':
2513 case 'd':
2514 case 'u':
2515 case 'x':
2516 {
2517 /* used by sprintf */
2518 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002519 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002520 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002521
2522 if (*f == 'u') {
2523 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2524
2525 if (longflag)
2526 len = sprintf(buffer, fmt,
2527 va_arg(*vargs, unsigned long));
2528#ifdef HAVE_LONG_LONG
2529 else if (longlongflag)
2530 len = sprintf(buffer, fmt,
2531 va_arg(*vargs, unsigned PY_LONG_LONG));
2532#endif
2533 else if (size_tflag)
2534 len = sprintf(buffer, fmt,
2535 va_arg(*vargs, size_t));
2536 else
2537 len = sprintf(buffer, fmt,
2538 va_arg(*vargs, unsigned int));
2539 }
2540 else if (*f == 'x') {
2541 makefmt(fmt, 0, 0, 0, 'x');
2542 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2543 }
2544 else {
2545 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2546
2547 if (longflag)
2548 len = sprintf(buffer, fmt,
2549 va_arg(*vargs, long));
2550#ifdef HAVE_LONG_LONG
2551 else if (longlongflag)
2552 len = sprintf(buffer, fmt,
2553 va_arg(*vargs, PY_LONG_LONG));
2554#endif
2555 else if (size_tflag)
2556 len = sprintf(buffer, fmt,
2557 va_arg(*vargs, Py_ssize_t));
2558 else
2559 len = sprintf(buffer, fmt,
2560 va_arg(*vargs, int));
2561 }
2562 assert(len >= 0);
2563
Victor Stinnere215d962012-10-06 23:03:36 +02002564 if (precision < len)
2565 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002566
2567 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002568 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2569 return NULL;
2570
Victor Stinnere215d962012-10-06 23:03:36 +02002571 if (width > precision) {
2572 Py_UCS4 fillchar;
2573 fill = width - precision;
2574 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002575 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2576 return NULL;
2577 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002578 }
Victor Stinner15a11362012-10-06 23:48:20 +02002579 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002580 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002581 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2582 return NULL;
2583 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002584 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585
Victor Stinner4a587072013-11-19 12:54:53 +01002586 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2587 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002588 break;
2589 }
2590
2591 case 'p':
2592 {
2593 char number[MAX_LONG_LONG_CHARS];
2594
2595 len = sprintf(number, "%p", va_arg(*vargs, void*));
2596 assert(len >= 0);
2597
2598 /* %p is ill-defined: ensure leading 0x. */
2599 if (number[1] == 'X')
2600 number[1] = 'x';
2601 else if (number[1] != 'x') {
2602 memmove(number + 2, number,
2603 strlen(number) + 1);
2604 number[0] = '0';
2605 number[1] = 'x';
2606 len += 2;
2607 }
2608
Victor Stinner4a587072013-11-19 12:54:53 +01002609 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
2611 break;
2612 }
2613
2614 case 's':
2615 {
2616 /* UTF-8 */
2617 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002618 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002619 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002620 break;
2621 }
2622
2623 case 'U':
2624 {
2625 PyObject *obj = va_arg(*vargs, PyObject *);
2626 assert(obj && _PyUnicode_CHECK(obj));
2627
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002629 return NULL;
2630 break;
2631 }
2632
2633 case 'V':
2634 {
2635 PyObject *obj = va_arg(*vargs, PyObject *);
2636 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002637 if (obj) {
2638 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002639 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002640 return NULL;
2641 }
2642 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002643 assert(str != NULL);
2644 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002645 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002646 }
2647 break;
2648 }
2649
2650 case 'S':
2651 {
2652 PyObject *obj = va_arg(*vargs, PyObject *);
2653 PyObject *str;
2654 assert(obj);
2655 str = PyObject_Str(obj);
2656 if (!str)
2657 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002658 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002659 Py_DECREF(str);
2660 return NULL;
2661 }
2662 Py_DECREF(str);
2663 break;
2664 }
2665
2666 case 'R':
2667 {
2668 PyObject *obj = va_arg(*vargs, PyObject *);
2669 PyObject *repr;
2670 assert(obj);
2671 repr = PyObject_Repr(obj);
2672 if (!repr)
2673 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002674 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002675 Py_DECREF(repr);
2676 return NULL;
2677 }
2678 Py_DECREF(repr);
2679 break;
2680 }
2681
2682 case 'A':
2683 {
2684 PyObject *obj = va_arg(*vargs, PyObject *);
2685 PyObject *ascii;
2686 assert(obj);
2687 ascii = PyObject_ASCII(obj);
2688 if (!ascii)
2689 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002690 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002691 Py_DECREF(ascii);
2692 return NULL;
2693 }
2694 Py_DECREF(ascii);
2695 break;
2696 }
2697
2698 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002699 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002700 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002701 break;
2702
2703 default:
2704 /* if we stumble upon an unknown formatting code, copy the rest
2705 of the format string to the output string. (we cannot just
2706 skip the code, since there's no way to know what's in the
2707 argument list) */
2708 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002709 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002710 return NULL;
2711 f = p+len;
2712 return f;
2713 }
2714
2715 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002716 return f;
2717}
2718
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719PyObject *
2720PyUnicode_FromFormatV(const char *format, va_list vargs)
2721{
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_list vargs2;
2723 const char *f;
2724 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002725
Victor Stinner8f674cc2013-04-17 23:02:17 +02002726 _PyUnicodeWriter_Init(&writer);
2727 writer.min_length = strlen(format) + 100;
2728 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002729
2730 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2731 Copy it to be able to pass a reference to a subfunction. */
2732 Py_VA_COPY(vargs2, vargs);
2733
2734 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002736 f = unicode_fromformat_arg(&writer, f, &vargs2);
2737 if (f == NULL)
2738 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002741 const char *p;
2742 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002743
Victor Stinnere215d962012-10-06 23:03:36 +02002744 p = f;
2745 do
2746 {
2747 if ((unsigned char)*p > 127) {
2748 PyErr_Format(PyExc_ValueError,
2749 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2750 "string, got a non-ASCII byte: 0x%02x",
2751 (unsigned char)*p);
2752 return NULL;
2753 }
2754 p++;
2755 }
2756 while (*p != '\0' && *p != '%');
2757 len = p - f;
2758
2759 if (*p == '\0')
2760 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002761
2762 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002763 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002764
2765 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002767 }
Victor Stinnere215d962012-10-06 23:03:36 +02002768 return _PyUnicodeWriter_Finish(&writer);
2769
2770 fail:
2771 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002773}
2774
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775PyObject *
2776PyUnicode_FromFormat(const char *format, ...)
2777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 PyObject* ret;
2779 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002780
2781#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002783#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 ret = PyUnicode_FromFormatV(format, vargs);
2787 va_end(vargs);
2788 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002789}
2790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791#ifdef HAVE_WCHAR_H
2792
Victor Stinner5593d8a2010-10-02 11:11:27 +00002793/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2794 convert a Unicode object to a wide character string.
2795
Victor Stinnerd88d9832011-09-06 02:00:05 +02002796 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002797 character) required to convert the unicode object. Ignore size argument.
2798
Victor Stinnerd88d9832011-09-06 02:00:05 +02002799 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002800 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002803unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002804 wchar_t *w,
2805 Py_ssize_t size)
2806{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002807 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 const wchar_t *wstr;
2809
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002810 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (wstr == NULL)
2812 return -1;
2813
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002815 if (size > res)
2816 size = res + 1;
2817 else
2818 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002820 return res;
2821 }
2822 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002824}
2825
2826Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002827PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002828 wchar_t *w,
2829 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830{
2831 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 PyErr_BadInternalCall();
2833 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002835 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836}
2837
Victor Stinner137c34c2010-09-29 10:25:54 +00002838wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002839PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002840 Py_ssize_t *size)
2841{
2842 wchar_t* buffer;
2843 Py_ssize_t buflen;
2844
2845 if (unicode == NULL) {
2846 PyErr_BadInternalCall();
2847 return NULL;
2848 }
2849
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002850 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002851 if (buflen == -1)
2852 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002853 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002854 PyErr_NoMemory();
2855 return NULL;
2856 }
2857
Victor Stinner137c34c2010-09-29 10:25:54 +00002858 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2859 if (buffer == NULL) {
2860 PyErr_NoMemory();
2861 return NULL;
2862 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002863 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002864 if (buflen == -1) {
2865 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002866 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002867 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002868 if (size != NULL)
2869 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002870 return buffer;
2871}
2872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002873#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874
Alexander Belopolsky40018472011-02-26 01:02:56 +00002875PyObject *
2876PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002877{
Victor Stinner8faf8212011-12-08 22:14:11 +01002878 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002879 PyErr_SetString(PyExc_ValueError,
2880 "chr() arg not in range(0x110000)");
2881 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002882 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002883
Victor Stinner985a82a2014-01-03 12:53:47 +01002884 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002885}
2886
Alexander Belopolsky40018472011-02-26 01:02:56 +00002887PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002888PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002890 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002893 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002894 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002895 Py_INCREF(obj);
2896 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897 }
2898 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 /* For a Unicode subtype that's not a Unicode object,
2900 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002901 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002902 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002903 PyErr_Format(PyExc_TypeError,
2904 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002905 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002906 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002907}
2908
Alexander Belopolsky40018472011-02-26 01:02:56 +00002909PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002910PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002911 const char *encoding,
2912 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002913{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002914 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002915 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002916
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 PyErr_BadInternalCall();
2919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002921
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002922 /* Decoding bytes objects is the most common case and should be fast */
2923 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002924 if (PyBytes_GET_SIZE(obj) == 0)
2925 _Py_RETURN_UNICODE_EMPTY();
2926 v = PyUnicode_Decode(
2927 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2928 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002929 return v;
2930 }
2931
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002932 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 PyErr_SetString(PyExc_TypeError,
2934 "decoding str is not supported");
2935 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2939 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2940 PyErr_Format(PyExc_TypeError,
2941 "coercing to str: need bytes, bytearray "
2942 "or buffer-like object, %.80s found",
2943 Py_TYPE(obj)->tp_name);
2944 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002945 }
Tim Petersced69f82003-09-16 20:30:58 +00002946
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002947 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002948 PyBuffer_Release(&buffer);
2949 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002951
Serhiy Storchaka05997252013-01-26 12:14:02 +02002952 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002953 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002954 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955}
2956
/* Normalize an encoding name so that spelling variants such as "UTF_8"
   compare equal to "utf-8": characters for which Py_ISUPPER() is true are
   passed through Py_TOLOWER() and '_' is replaced with '-'.  A NULL
   encoding is normalized to "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   null-terminated on success.

   Return 1 on success, 0 on error (the normalized name and its
   terminating null byte do not fit in lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* An empty buffer cannot even hold the terminator.  Without this
       check lower_len - 1 below would wrap around, the bounds test in
       the loop could never fire and the loop would write out of bounds. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot; it is reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2996
Alexander Belopolsky40018472011-02-26 01:02:56 +00002997PyObject *
2998PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002999 Py_ssize_t size,
3000 const char *encoding,
3001 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003002{
3003 PyObject *buffer = NULL, *unicode;
3004 Py_buffer info;
3005 char lower[11]; /* Enough for any encoding shortcut */
3006
Fred Drakee4315f52000-05-09 19:53:39 +00003007 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003008 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003009 if ((strcmp(lower, "utf-8") == 0) ||
3010 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003011 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003012 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003013 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003014 (strcmp(lower, "iso-8859-1") == 0) ||
3015 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003016 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003017#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003018 else if (strcmp(lower, "mbcs") == 0)
3019 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003020#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003021 else if (strcmp(lower, "ascii") == 0)
3022 return PyUnicode_DecodeASCII(s, size, errors);
3023 else if (strcmp(lower, "utf-16") == 0)
3024 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3025 else if (strcmp(lower, "utf-32") == 0)
3026 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003030 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003031 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003032 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003033 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 if (buffer == NULL)
3035 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003036 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 if (unicode == NULL)
3038 goto onError;
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003041 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3042 "use codecs.decode() to decode to arbitrary types",
3043 encoding,
3044 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 Py_DECREF(unicode);
3046 goto onError;
3047 }
3048 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003049 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003050
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 Py_XDECREF(buffer);
3053 return NULL;
3054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
3057PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003060{
3061 PyObject *v;
3062
3063 if (!PyUnicode_Check(unicode)) {
3064 PyErr_BadArgument();
3065 goto onError;
3066 }
3067
3068 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003070
3071 /* Decode via the codec registry */
3072 v = PyCodec_Decode(unicode, encoding, errors);
3073 if (v == NULL)
3074 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003075 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003078 return NULL;
3079}
3080
Alexander Belopolsky40018472011-02-26 01:02:56 +00003081PyObject *
3082PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003083 const char *encoding,
3084 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003085{
3086 PyObject *v;
3087
3088 if (!PyUnicode_Check(unicode)) {
3089 PyErr_BadArgument();
3090 goto onError;
3091 }
3092
3093 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003095
3096 /* Decode via the codec registry */
3097 v = PyCodec_Decode(unicode, encoding, errors);
3098 if (v == NULL)
3099 goto onError;
3100 if (!PyUnicode_Check(v)) {
3101 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003102 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3103 "use codecs.decode() to decode to arbitrary types",
3104 encoding,
3105 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106 Py_DECREF(v);
3107 goto onError;
3108 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003109 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003112 return NULL;
3113}
3114
Alexander Belopolsky40018472011-02-26 01:02:56 +00003115PyObject *
3116PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003117 Py_ssize_t size,
3118 const char *encoding,
3119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
3121 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003122
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 unicode = PyUnicode_FromUnicode(s, size);
3124 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3127 Py_DECREF(unicode);
3128 return v;
3129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
3132PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003133 const char *encoding,
3134 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003135{
3136 PyObject *v;
3137
3138 if (!PyUnicode_Check(unicode)) {
3139 PyErr_BadArgument();
3140 goto onError;
3141 }
3142
3143 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003145
3146 /* Encode via the codec registry */
3147 v = PyCodec_Encode(unicode, encoding, errors);
3148 if (v == NULL)
3149 goto onError;
3150 return v;
3151
Benjamin Peterson29060642009-01-31 22:14:21 +00003152 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003153 return NULL;
3154}
3155
/* Locate the first character of wstr that wcstombs() refuses to convert.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 16 bits wide).  Return the index in wstr of the
   first unit that fails, or 0 if no failing character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Null-terminated scratch string holding the unit being probed. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *current = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together so it is converted as one unit. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3202
Victor Stinner1b579672011-12-17 05:47:23 +01003203static int
3204locale_error_handler(const char *errors, int *surrogateescape)
3205{
3206 if (errors == NULL) {
3207 *surrogateescape = 0;
3208 return 0;
3209 }
3210
3211 if (strcmp(errors, "strict") == 0) {
3212 *surrogateescape = 0;
3213 return 0;
3214 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003215 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003216 *surrogateescape = 1;
3217 return 0;
3218 }
3219 PyErr_Format(PyExc_ValueError,
3220 "only 'strict' and 'surrogateescape' error handlers "
3221 "are supported, not '%s'",
3222 errors);
3223 return -1;
3224}
3225
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003226PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003227PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003228{
3229 Py_ssize_t wlen, wlen2;
3230 wchar_t *wstr;
3231 PyObject *bytes = NULL;
3232 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003233 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003234 PyObject *exc;
3235 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003236 int surrogateescape;
3237
3238 if (locale_error_handler(errors, &surrogateescape) < 0)
3239 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003240
3241 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3242 if (wstr == NULL)
3243 return NULL;
3244
3245 wlen2 = wcslen(wstr);
3246 if (wlen2 != wlen) {
3247 PyMem_Free(wstr);
3248 PyErr_SetString(PyExc_TypeError, "embedded null character");
3249 return NULL;
3250 }
3251
3252 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003253 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003254 char *str;
3255
3256 str = _Py_wchar2char(wstr, &error_pos);
3257 if (str == NULL) {
3258 if (error_pos == (size_t)-1) {
3259 PyErr_NoMemory();
3260 PyMem_Free(wstr);
3261 return NULL;
3262 }
3263 else {
3264 goto encode_error;
3265 }
3266 }
3267 PyMem_Free(wstr);
3268
3269 bytes = PyBytes_FromString(str);
3270 PyMem_Free(str);
3271 }
3272 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003273 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003274 size_t len, len2;
3275
3276 len = wcstombs(NULL, wstr, 0);
3277 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003278 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003279 goto encode_error;
3280 }
3281
3282 bytes = PyBytes_FromStringAndSize(NULL, len);
3283 if (bytes == NULL) {
3284 PyMem_Free(wstr);
3285 return NULL;
3286 }
3287
3288 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3289 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003290 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003291 goto encode_error;
3292 }
3293 PyMem_Free(wstr);
3294 }
3295 return bytes;
3296
3297encode_error:
3298 errmsg = strerror(errno);
3299 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003300
3301 if (error_pos == (size_t)-1)
3302 error_pos = wcstombs_errorpos(wstr);
3303
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003304 PyMem_Free(wstr);
3305 Py_XDECREF(bytes);
3306
Victor Stinner2f197072011-12-17 07:08:30 +01003307 if (errmsg != NULL) {
3308 size_t errlen;
3309 wstr = _Py_char2wchar(errmsg, &errlen);
3310 if (wstr != NULL) {
3311 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003312 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003313 } else
3314 errmsg = NULL;
3315 }
3316 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003317 reason = PyUnicode_FromString(
3318 "wcstombs() encountered an unencodable "
3319 "wide character");
3320 if (reason == NULL)
3321 return NULL;
3322
3323 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3324 "locale", unicode,
3325 (Py_ssize_t)error_pos,
3326 (Py_ssize_t)(error_pos+1),
3327 reason);
3328 Py_DECREF(reason);
3329 if (exc != NULL) {
3330 PyCodec_StrictErrors(exc);
3331 Py_XDECREF(exc);
3332 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003333 return NULL;
3334}
3335
Victor Stinnerad158722010-10-27 00:25:46 +00003336PyObject *
3337PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003338{
Victor Stinner99b95382011-07-04 14:23:54 +02003339#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003340 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003341#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003343#else
Victor Stinner793b5312011-04-27 00:24:21 +02003344 PyInterpreterState *interp = PyThreadState_GET()->interp;
3345 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3346 cannot use it to encode and decode filenames before it is loaded. Load
3347 the Python codec requires to encode at least its own filename. Use the C
3348 version of the locale codec until the codec registry is initialized and
3349 the Python codec is loaded.
3350
3351 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3352 cannot only rely on it: check also interp->fscodec_initialized for
3353 subinterpreters. */
3354 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003355 return PyUnicode_AsEncodedString(unicode,
3356 Py_FileSystemDefaultEncoding,
3357 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003358 }
3359 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003360 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003361 }
Victor Stinnerad158722010-10-27 00:25:46 +00003362#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003363}
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 const char *encoding,
3368 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369{
3370 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003371 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003372
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 if (!PyUnicode_Check(unicode)) {
3374 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 }
Fred Drakee4315f52000-05-09 19:53:39 +00003377
Fred Drakee4315f52000-05-09 19:53:39 +00003378 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003379 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003380 if ((strcmp(lower, "utf-8") == 0) ||
3381 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003382 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003383 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003384 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003385 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003387 }
Victor Stinner37296e82010-06-10 13:36:23 +00003388 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003389 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003390 (strcmp(lower, "iso-8859-1") == 0) ||
3391 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003393#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003394 else if (strcmp(lower, "mbcs") == 0)
3395 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003396#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003397 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400
3401 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003402 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003404 return NULL;
3405
3406 /* The normal path */
3407 if (PyBytes_Check(v))
3408 return v;
3409
3410 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003411 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003412 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003413 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003414
3415 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003416 "encoder %s returned bytearray instead of bytes; "
3417 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003418 encoding);
3419 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003420 Py_DECREF(v);
3421 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003423
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003424 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3425 Py_DECREF(v);
3426 return b;
3427 }
3428
3429 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003430 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3431 "use codecs.encode() to encode to arbitrary types",
3432 encoding,
3433 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003434 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003435 return NULL;
3436}
3437
Alexander Belopolsky40018472011-02-26 01:02:56 +00003438PyObject *
3439PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003440 const char *encoding,
3441 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003442{
3443 PyObject *v;
3444
3445 if (!PyUnicode_Check(unicode)) {
3446 PyErr_BadArgument();
3447 goto onError;
3448 }
3449
3450 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003451 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003452
3453 /* Encode via the codec registry */
3454 v = PyCodec_Encode(unicode, encoding, errors);
3455 if (v == NULL)
3456 goto onError;
3457 if (!PyUnicode_Check(v)) {
3458 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003459 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3460 "use codecs.encode() to encode to arbitrary types",
3461 encoding,
3462 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003463 Py_DECREF(v);
3464 goto onError;
3465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003467
Benjamin Peterson29060642009-01-31 22:14:21 +00003468 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 return NULL;
3470}
3471
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot decode.  Return its byte offset, or 0 when no such sequence is
   found or when the build lacks mbrtowc() (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3502
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003503PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003504PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003505 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506{
3507 wchar_t smallbuf[256];
3508 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3509 wchar_t *wstr;
3510 size_t wlen, wlen2;
3511 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003512 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003513 size_t error_pos;
3514 char *errmsg;
3515 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003516
3517 if (locale_error_handler(errors, &surrogateescape) < 0)
3518 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003519
3520 if (str[len] != '\0' || len != strlen(str)) {
3521 PyErr_SetString(PyExc_TypeError, "embedded null character");
3522 return NULL;
3523 }
3524
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003525 if (surrogateescape) {
3526 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003527 wstr = _Py_char2wchar(str, &wlen);
3528 if (wstr == NULL) {
3529 if (wlen == (size_t)-1)
3530 PyErr_NoMemory();
3531 else
3532 PyErr_SetFromErrno(PyExc_OSError);
3533 return NULL;
3534 }
3535
3536 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003537 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003538 }
3539 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003540 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003541#ifndef HAVE_BROKEN_MBSTOWCS
3542 wlen = mbstowcs(NULL, str, 0);
3543#else
3544 wlen = len;
3545#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003546 if (wlen == (size_t)-1)
3547 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003548 if (wlen+1 <= smallbuf_len) {
3549 wstr = smallbuf;
3550 }
3551 else {
3552 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3553 return PyErr_NoMemory();
3554
3555 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3556 if (!wstr)
3557 return PyErr_NoMemory();
3558 }
3559
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003560 wlen2 = mbstowcs(wstr, str, wlen+1);
3561 if (wlen2 == (size_t)-1) {
3562 if (wstr != smallbuf)
3563 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003564 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003565 }
3566#ifdef HAVE_BROKEN_MBSTOWCS
3567 assert(wlen2 == wlen);
3568#endif
3569 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3570 if (wstr != smallbuf)
3571 PyMem_Free(wstr);
3572 }
3573 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003574
3575decode_error:
3576 errmsg = strerror(errno);
3577 assert(errmsg != NULL);
3578
3579 error_pos = mbstowcs_errorpos(str, len);
3580 if (errmsg != NULL) {
3581 size_t errlen;
3582 wstr = _Py_char2wchar(errmsg, &errlen);
3583 if (wstr != NULL) {
3584 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003585 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003586 } else
3587 errmsg = NULL;
3588 }
3589 if (errmsg == NULL)
3590 reason = PyUnicode_FromString(
3591 "mbstowcs() encountered an invalid multibyte sequence");
3592 if (reason == NULL)
3593 return NULL;
3594
3595 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3596 "locale", str, len,
3597 (Py_ssize_t)error_pos,
3598 (Py_ssize_t)(error_pos+1),
3599 reason);
3600 Py_DECREF(reason);
3601 if (exc != NULL) {
3602 PyCodec_StrictErrors(exc);
3603 Py_XDECREF(exc);
3604 }
3605 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003606}
3607
3608PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003609PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003610{
3611 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003612 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003613}
3614
3615
3616PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003617PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003618 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003619 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3620}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003621
Christian Heimes5894ba72007-11-04 11:43:14 +00003622PyObject*
3623PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3624{
Victor Stinner99b95382011-07-04 14:23:54 +02003625#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003626 return PyUnicode_DecodeMBCS(s, size, NULL);
3627#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003628 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003629#else
Victor Stinner793b5312011-04-27 00:24:21 +02003630 PyInterpreterState *interp = PyThreadState_GET()->interp;
3631 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3632 cannot use it to encode and decode filenames before it is loaded. Load
3633 the Python codec requires to encode at least its own filename. Use the C
3634 version of the locale codec until the codec registry is initialized and
3635 the Python codec is loaded.
3636
3637 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3638 cannot only rely on it: check also interp->fscodec_initialized for
3639 subinterpreters. */
3640 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003641 return PyUnicode_Decode(s, size,
3642 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003643 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003644 }
3645 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003646 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003647 }
Victor Stinnerad158722010-10-27 00:25:46 +00003648#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003649}
3650
Martin v. Löwis011e8422009-05-05 04:43:17 +00003651
3652int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003653_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003654{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003655 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003656
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003657 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003658 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003659 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3660 PyUnicode_GET_LENGTH(str), '\0', 1);
3661 if (pos == -1)
3662 return 0;
3663 else
3664 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003665}
3666
Antoine Pitrou13348842012-01-29 18:36:34 +01003667int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003668PyUnicode_FSConverter(PyObject* arg, void* addr)
3669{
3670 PyObject *output = NULL;
3671 Py_ssize_t size;
3672 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003673 if (arg == NULL) {
3674 Py_DECREF(*(PyObject**)addr);
3675 return 1;
3676 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003677 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003678 output = arg;
3679 Py_INCREF(output);
3680 }
3681 else {
3682 arg = PyUnicode_FromObject(arg);
3683 if (!arg)
3684 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003685 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003686 Py_DECREF(arg);
3687 if (!output)
3688 return 0;
3689 if (!PyBytes_Check(output)) {
3690 Py_DECREF(output);
3691 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3692 return 0;
3693 }
3694 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003695 size = PyBytes_GET_SIZE(output);
3696 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003697 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003698 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003699 Py_DECREF(output);
3700 return 0;
3701 }
3702 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003703 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003704}
3705
3706
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003707int
3708PyUnicode_FSDecoder(PyObject* arg, void* addr)
3709{
3710 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003711 if (arg == NULL) {
3712 Py_DECREF(*(PyObject**)addr);
3713 return 1;
3714 }
3715 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003716 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003718 output = arg;
3719 Py_INCREF(output);
3720 }
3721 else {
3722 arg = PyBytes_FromObject(arg);
3723 if (!arg)
3724 return 0;
3725 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3726 PyBytes_GET_SIZE(arg));
3727 Py_DECREF(arg);
3728 if (!output)
3729 return 0;
3730 if (!PyUnicode_Check(output)) {
3731 Py_DECREF(output);
3732 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3733 return 0;
3734 }
3735 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003736 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003737 Py_DECREF(output);
3738 return 0;
3739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003741 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003742 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3743 Py_DECREF(output);
3744 return 0;
3745 }
3746 *(PyObject**)addr = output;
3747 return Py_CLEANUP_SUPPORTED;
3748}
3749
3750
Martin v. Löwis5b222132007-06-10 09:51:05 +00003751char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003753{
Christian Heimesf3863112007-11-22 07:46:41 +00003754 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 return NULL;
3759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003761 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003763 if (PyUnicode_UTF8(unicode) == NULL) {
3764 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3766 if (bytes == NULL)
3767 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3769 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003770 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 Py_DECREF(bytes);
3772 return NULL;
3773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3775 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3776 PyBytes_AS_STRING(bytes),
3777 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778 Py_DECREF(bytes);
3779 }
3780
3781 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003782 *psize = PyUnicode_UTF8_LENGTH(unicode);
3783 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003784}
3785
3786char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3790}
3791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003792Py_UNICODE *
3793PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 const unsigned char *one_byte;
3796#if SIZEOF_WCHAR_T == 4
3797 const Py_UCS2 *two_bytes;
3798#else
3799 const Py_UCS4 *four_bytes;
3800 const Py_UCS4 *ucs4_end;
3801 Py_ssize_t num_surrogates;
3802#endif
3803 wchar_t *w;
3804 wchar_t *wchar_end;
3805
3806 if (!PyUnicode_Check(unicode)) {
3807 PyErr_BadArgument();
3808 return NULL;
3809 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003810 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003812 assert(_PyUnicode_KIND(unicode) != 0);
3813 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003817 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3818 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819 num_surrogates = 0;
3820
3821 for (; four_bytes < ucs4_end; ++four_bytes) {
3822 if (*four_bytes > 0xFFFF)
3823 ++num_surrogates;
3824 }
3825
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3827 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3828 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 PyErr_NoMemory();
3830 return NULL;
3831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 w = _PyUnicode_WSTR(unicode);
3835 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3836 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3838 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003839 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003841 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3842 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 }
3844 else
3845 *w = *four_bytes;
3846
3847 if (w > wchar_end) {
3848 assert(0 && "Miscalculated string end");
3849 }
3850 }
3851 *w = 0;
3852#else
3853 /* sizeof(wchar_t) == 4 */
3854 Py_FatalError("Impossible unicode object state, wstr and str "
3855 "should share memory already.");
3856 return NULL;
3857#endif
3858 }
3859 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3861 (_PyUnicode_LENGTH(unicode) + 1));
3862 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 PyErr_NoMemory();
3864 return NULL;
3865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3867 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3868 w = _PyUnicode_WSTR(unicode);
3869 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003871 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3872 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 for (; w < wchar_end; ++one_byte, ++w)
3874 *w = *one_byte;
3875 /* null-terminate the wstr */
3876 *w = 0;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 for (; w < wchar_end; ++two_bytes, ++w)
3882 *w = *two_bytes;
3883 /* null-terminate the wstr */
3884 *w = 0;
3885#else
3886 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 PyObject_FREE(_PyUnicode_WSTR(unicode));
3888 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 Py_FatalError("Impossible unicode object state, wstr "
3890 "and str should share memory already.");
3891 return NULL;
3892#endif
3893 }
3894 else {
3895 assert(0 && "This should never happen.");
3896 }
3897 }
3898 }
3899 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 *size = PyUnicode_WSTR_LENGTH(unicode);
3901 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003902}
3903
Alexander Belopolsky40018472011-02-26 01:02:56 +00003904Py_UNICODE *
3905PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908}
3909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910
Alexander Belopolsky40018472011-02-26 01:02:56 +00003911Py_ssize_t
3912PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913{
3914 if (!PyUnicode_Check(unicode)) {
3915 PyErr_BadArgument();
3916 goto onError;
3917 }
3918 return PyUnicode_GET_SIZE(unicode);
3919
Benjamin Peterson29060642009-01-31 22:14:21 +00003920 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 return -1;
3922}
3923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924Py_ssize_t
3925PyUnicode_GetLength(PyObject *unicode)
3926{
Victor Stinner07621332012-06-16 04:53:46 +02003927 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928 PyErr_BadArgument();
3929 return -1;
3930 }
Victor Stinner07621332012-06-16 04:53:46 +02003931 if (PyUnicode_READY(unicode) == -1)
3932 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 return PyUnicode_GET_LENGTH(unicode);
3934}
3935
3936Py_UCS4
3937PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3938{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003939 void *data;
3940 int kind;
3941
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003942 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3943 PyErr_BadArgument();
3944 return (Py_UCS4)-1;
3945 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003946 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003947 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 return (Py_UCS4)-1;
3949 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003950 data = PyUnicode_DATA(unicode);
3951 kind = PyUnicode_KIND(unicode);
3952 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953}
3954
3955int
3956PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3957{
3958 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003959 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 return -1;
3961 }
Victor Stinner488fa492011-12-12 00:01:39 +01003962 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003963 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003964 PyErr_SetString(PyExc_IndexError, "string index out of range");
3965 return -1;
3966 }
Victor Stinner488fa492011-12-12 00:01:39 +01003967 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003968 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003969 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3970 PyErr_SetString(PyExc_ValueError, "character out of range");
3971 return -1;
3972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3974 index, ch);
3975 return 0;
3976}
3977
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3983
Victor Stinner554f3f02010-06-16 23:33:54 +00003984/* create or adjust a UnicodeDecodeError */
3985static void
3986make_decode_exception(PyObject **exceptionObject,
3987 const char *encoding,
3988 const char *input, Py_ssize_t length,
3989 Py_ssize_t startpos, Py_ssize_t endpos,
3990 const char *reason)
3991{
3992 if (*exceptionObject == NULL) {
3993 *exceptionObject = PyUnicodeDecodeError_Create(
3994 encoding, input, length, startpos, endpos, reason);
3995 }
3996 else {
3997 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3998 goto onError;
3999 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4000 goto onError;
4001 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4002 goto onError;
4003 }
4004 return;
4005
4006onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004007 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004008}
4009
#ifdef HAVE_MBCS
/* Error handling callback helper for decoders writing into a wchar_t
   (PyUnicode_WCHAR_KIND) output string.

   Looks up the error handler named `errors` (cached in *errorHandler),
   creates or updates *exceptionObject, calls the handler and validates its
   (str, int) result.  On success the replacement string is copied to
   *output at *outpos (growing *output when needed), *outpos is advanced,
   and *input, *inend, *endinpos and *inptr are refreshed from the
   exception object and the position returned by the handler.

   Returns 0 on success, -1 on error (with an exception set).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "O!n" is the PyArg_ParseTuple format; the text after the ';'
       (starting at index 4) doubles as the TypeError message below. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the bytes accessors below must not be applied to a
           non-bytes object, and the error just set must be reported. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string).
       Each addition is checked so the sum cannot overflow Py_ssize_t. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically, unless doubling itself would overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4116
4117static int
4118unicode_decode_call_errorhandler_writer(
4119 const char *errors, PyObject **errorHandler,
4120 const char *encoding, const char *reason,
4121 const char **input, const char **inend, Py_ssize_t *startinpos,
4122 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4123 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4124{
4125 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4126
4127 PyObject *restuple = NULL;
4128 PyObject *repunicode = NULL;
4129 Py_ssize_t insize;
4130 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004131 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004132 PyObject *inputobj = NULL;
4133
4134 if (*errorHandler == NULL) {
4135 *errorHandler = PyCodec_LookupError(errors);
4136 if (*errorHandler == NULL)
4137 goto onError;
4138 }
4139
4140 make_decode_exception(exceptionObject,
4141 encoding,
4142 *input, *inend - *input,
4143 *startinpos, *endinpos,
4144 reason);
4145 if (*exceptionObject == NULL)
4146 goto onError;
4147
4148 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4149 if (restuple == NULL)
4150 goto onError;
4151 if (!PyTuple_Check(restuple)) {
4152 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4153 goto onError;
4154 }
4155 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004156 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
4158 /* Copy back the bytes variables, which might have been modified by the
4159 callback */
4160 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4161 if (!inputobj)
4162 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004163 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004165 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004166 *input = PyBytes_AS_STRING(inputobj);
4167 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004168 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004169 /* we can DECREF safely, as the exception has another reference,
4170 so the object won't go away. */
4171 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004175 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4177 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179
Victor Stinner8f674cc2013-04-17 23:02:17 +02004180 if (PyUnicode_READY(repunicode) < 0)
4181 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004182 replen = PyUnicode_GET_LENGTH(repunicode);
4183 writer->min_length += replen;
4184 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004185 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004186 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004187 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004190 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193 Py_XDECREF(restuple);
4194 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199}
4200
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004201/* --- UTF-7 Codec -------------------------------------------------------- */
4202
Antoine Pitrou244651a2009-05-04 18:56:13 +00004203/* See RFC2152 for details. We encode conservatively and decode liberally. */
4204
/* Base-64 helper macros.  Each one may evaluate its argument more than
   once, so pass only side-effect-free expressions. */

/* IS_BASE64(c): true when c is one of A-Z, a-z, 0-9, '+' or '/'. */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64(c): the 6-bit value (0..63) of base-64 character c.
   Only meaningful when IS_BASE64(c) holds; any other input yields 63. */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64(n): the base-64 character for the low 6 bits of n. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): true when a byte met in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except '+',
 * which opens a base-64 section. */
#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4235
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is never written to.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; only values 1..127 can be direct
 * directO  - non-zero to encode Set O characters as themselves
 * directWS - non-zero to encode whitespace characters as themselves
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) keep their meaning.  `c` is evaluated
 * several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004281
Alexander Belopolsky40018472011-02-26 01:02:56 +00004282PyObject *
4283PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004284 Py_ssize_t size,
4285 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004287 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4288}
4289
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290/* The decoder. The only state we preserve is our read position,
4291 * i.e. how many characters we have consumed. So if we end in the
4292 * middle of a shift sequence we have to back off the read position
4293 * and the output to the beginning of the sequence, otherwise we lose
4294 * all the shift state (seen bits, number of bits seen, high
4295 * surrogate). */
4296
Alexander Belopolsky40018472011-02-26 01:02:56 +00004297PyObject *
4298PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004299 Py_ssize_t size,
4300 const char *errors,
4301 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004304 Py_ssize_t startinpos;
4305 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004307 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004308 const char *errmsg = "";
4309 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004310 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311 unsigned int base64bits = 0;
4312 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004313 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 PyObject *errorHandler = NULL;
4315 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004317 if (size == 0) {
4318 if (consumed)
4319 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004320 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004321 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004324 _PyUnicodeWriter_Init(&writer);
4325 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326
4327 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328 e = s + size;
4329
4330 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004333 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335 if (inShift) { /* in a base-64 section */
4336 if (IS_BASE64(ch)) { /* consume a base-64 character */
4337 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4338 base64bits += 6;
4339 s++;
4340 if (base64bits >= 16) {
4341 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004342 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 base64bits -= 16;
4344 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004345 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004346 if (surrogate) {
4347 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004348 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4349 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004350 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004353 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 }
4355 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004356 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004357 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
4360 }
Victor Stinner551ac952011-11-29 22:58:13 +01004361 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 /* first surrogate */
4363 surrogate = outCh;
4364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004366 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 }
4369 }
4370 }
4371 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 inShift = 0;
4373 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004375 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004376 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004377 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (base64bits > 0) { /* left-over bits */
4380 if (base64bits >= 6) {
4381 /* We've seen at least one base-64 character */
4382 errmsg = "partial character in shift sequence";
4383 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 else {
4386 /* Some bits remain; they should be zero */
4387 if (base64buffer != 0) {
4388 errmsg = "non-zero padding bits in shift sequence";
4389 goto utf7Error;
4390 }
4391 }
4392 }
4393 if (ch != '-') {
4394 /* '-' is absorbed; other terminating
4395 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004396 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004397 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
4400 }
4401 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 s++; /* consume '+' */
4404 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004406 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 }
4409 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004413 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
4415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004418 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else {
4422 startinpos = s-starts;
4423 s++;
4424 errmsg = "unexpected special character";
4425 goto utf7Error;
4426 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 errors, &errorHandler,
4432 "utf7", errmsg,
4433 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004434 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004436 }
4437
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 /* end of string */
4439
4440 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4441 /* if we're in an inconsistent state, that's an error */
4442 if (surrogate ||
4443 (base64bits >= 6) ||
4444 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004446 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 errors, &errorHandler,
4448 "utf7", "unterminated shift sequence",
4449 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 goto onError;
4452 if (s < e)
4453 goto restart;
4454 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456
4457 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004461 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004462 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004463 writer.kind, writer.data, shiftOutStart);
4464 Py_XDECREF(errorHandler);
4465 Py_XDECREF(exc);
4466 _PyUnicodeWriter_Dealloc(&writer);
4467 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004468 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004469 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 }
4471 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004474 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(errorHandler);
4477 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004478 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 Py_XDECREF(errorHandler);
4482 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 return NULL;
4485}
4486
4487
Alexander Belopolsky40018472011-02-26 01:02:56 +00004488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489_PyUnicode_EncodeUTF7(PyObject *str,
4490 int base64SetO,
4491 int base64WhiteSpace,
4492 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004494 int kind;
4495 void *data;
4496 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004497 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 unsigned int base64bits = 0;
4501 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 char * out;
4503 char * start;
4504
Benjamin Petersonbac79492012-01-14 13:34:47 -05004505 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004506 return NULL;
4507 kind = PyUnicode_KIND(str);
4508 data = PyUnicode_DATA(str);
4509 len = PyUnicode_GET_LENGTH(str);
4510
4511 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004514 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004515 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004516 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004517 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 if (v == NULL)
4519 return NULL;
4520
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004521 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004522 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004523 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 if (inShift) {
4526 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4527 /* shifting out */
4528 if (base64bits) { /* output remaining bits */
4529 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4530 base64buffer = 0;
4531 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 }
4533 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 /* Characters not in the BASE64 set implicitly unshift the sequence
4535 so no '-' is required, except if the character is itself a '-' */
4536 if (IS_BASE64(ch) || ch == '-') {
4537 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 *out++ = (char) ch;
4540 }
4541 else {
4542 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004543 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 else { /* not in a shift sequence */
4546 if (ch == '+') {
4547 *out++ = '+';
4548 *out++ = '-';
4549 }
4550 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4551 *out++ = (char) ch;
4552 }
4553 else {
4554 *out++ = '+';
4555 inShift = 1;
4556 goto encode_char;
4557 }
4558 }
4559 continue;
4560encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004562 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004563
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 /* code first surrogate */
4565 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004566 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 while (base64bits >= 6) {
4568 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4569 base64bits -= 6;
4570 }
4571 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004572 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits += 16;
4575 base64buffer = (base64buffer << 16) | ch;
4576 while (base64bits >= 6) {
4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4578 base64bits -= 6;
4579 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004580 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (base64bits)
4582 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4583 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004585 if (_PyBytes_Resize(&v, out - start) < 0)
4586 return NULL;
4587 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589PyObject *
4590PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4591 Py_ssize_t size,
4592 int base64SetO,
4593 int base64WhiteSpace,
4594 const char *errors)
4595{
4596 PyObject *result;
4597 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4598 if (tmp == NULL)
4599 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004600 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004601 base64WhiteSpace, errors);
4602 Py_DECREF(tmp);
4603 return result;
4604}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605
/* The helper macros used by the UTF-7 codec above are not needed past
   this point; remove them. */
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
#undef FROM_BASE64
#undef IS_BASE64
#undef TO_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612/* --- UTF-8 Codec -------------------------------------------------------- */
4613
Alexander Belopolsky40018472011-02-26 01:02:56 +00004614PyObject *
4615PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004616 Py_ssize_t size,
4617 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618{
Walter Dörwald69652032004-09-07 20:24:22 +00004619 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4620}
4621
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004622#include "stringlib/asciilib.h"
4623#include "stringlib/codecs.h"
4624#include "stringlib/undef.h"
4625
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004626#include "stringlib/ucs1lib.h"
4627#include "stringlib/codecs.h"
4628#include "stringlib/undef.h"
4629
4630#include "stringlib/ucs2lib.h"
4631#include "stringlib/codecs.h"
4632#include "stringlib/undef.h"
4633
4634#include "stringlib/ucs4lib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
Antoine Pitrouab868312009-01-10 15:40:25 +00004638/* Mask to quickly check whether a C 'long' contains a
4639 non-ASCII, UTF8-encoded char. */
4640#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004641# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004642#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004643# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004644#else
4645# error C 'long' size should be either 4 or 8!
4646#endif
4647
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648static Py_ssize_t
4649ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004652 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004654 /*
4655 * Issue #17237: m68k is a bit different from most architectures in
4656 * that objects do not use "natural alignment" - for example, int and
4657 * long are only aligned at 2-byte boundaries. Therefore the assert()
4658 * won't work; also, tests have shown that skipping the "optimised
4659 * version" will even speed up m68k.
4660 */
4661#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004663 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4664 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004665 /* Fast path, see in STRINGLIB(utf8_decode) for
4666 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004667 /* Help allocation */
4668 const char *_p = p;
4669 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670 while (_p < aligned_end) {
4671 unsigned long value = *(const unsigned long *) _p;
4672 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 *((unsigned long *)q) = value;
4675 _p += SIZEOF_LONG;
4676 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004677 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678 p = _p;
4679 while (p < end) {
4680 if ((unsigned char)*p & 0x80)
4681 break;
4682 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004687#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 while (p < end) {
4689 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4690 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004691 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004692 /* Help allocation */
4693 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004694 while (_p < aligned_end) {
4695 unsigned long value = *(unsigned long *) _p;
4696 if (value & ASCII_CHAR_MASK)
4697 break;
4698 _p += SIZEOF_LONG;
4699 }
4700 p = _p;
4701 if (_p == end)
4702 break;
4703 }
4704 if ((unsigned char)*p & 0x80)
4705 break;
4706 ++p;
4707 }
4708 memcpy(dest, start, p - start);
4709 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710}
Antoine Pitrouab868312009-01-10 15:40:25 +00004711
Victor Stinner785938e2011-12-11 20:09:03 +01004712PyObject *
4713PyUnicode_DecodeUTF8Stateful(const char *s,
4714 Py_ssize_t size,
4715 const char *errors,
4716 Py_ssize_t *consumed)
4717{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004719 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721
4722 Py_ssize_t startinpos;
4723 Py_ssize_t endinpos;
4724 const char *errmsg = "";
4725 PyObject *errorHandler = NULL;
4726 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004727
4728 if (size == 0) {
4729 if (consumed)
4730 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004731 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004732 }
4733
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4735 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004736 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737 *consumed = 1;
4738 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004739 }
4740
Victor Stinner8f674cc2013-04-17 23:02:17 +02004741 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004742 writer.min_length = size;
4743 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004744 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004745
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 writer.pos = ascii_decode(s, end, writer.data);
4747 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 while (s < end) {
4749 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004750 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004752 if (PyUnicode_IS_ASCII(writer.buffer))
4753 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004757 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 } else {
4759 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004760 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 }
4762
4763 switch (ch) {
4764 case 0:
4765 if (s == end || consumed)
4766 goto End;
4767 errmsg = "unexpected end of data";
4768 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004769 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 break;
4771 case 1:
4772 errmsg = "invalid start byte";
4773 startinpos = s - starts;
4774 endinpos = startinpos + 1;
4775 break;
4776 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004777 case 3:
4778 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 errmsg = "invalid continuation byte";
4780 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004781 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 break;
4783 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004784 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785 goto onError;
4786 continue;
4787 }
4788
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004789 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004790 errors, &errorHandler,
4791 "utf-8", errmsg,
4792 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004793 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004795 }
4796
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798 if (consumed)
4799 *consumed = s - starts;
4800
4801 Py_XDECREF(errorHandler);
4802 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004803 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804
4805onError:
4806 Py_XDECREF(errorHandler);
4807 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004810}
4811
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded is stored as 0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory).  Return NULL if `size`
   is negative or too large for the buffer size to be computed, or on
   memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Reject sizes for which (size + 1) * sizeof(wchar_t) would exceed
       PY_SSIZE_T_MAX.  Written without computing size + 1 so the check
       itself cannot overflow; negative sizes are rejected as well. */
    if (size < 0 || (size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* ch is about to be split into a surrogate pair, so it must be
               a code point beyond the BMP, not a surrogate itself. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868/* Primary internal function which creates utf8 encoded bytes objects.
4869
4870 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004871 and allocate exactly as much space needed at the end. Else allocate the
4872 maximum possible needed (4 result bytes per Unicode character), and return
4873 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004874*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004875PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004876_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877{
Victor Stinner6099a032011-12-18 14:22:26 +01004878 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879 void *data;
4880 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 if (!PyUnicode_Check(unicode)) {
4883 PyErr_BadArgument();
4884 return NULL;
4885 }
4886
4887 if (PyUnicode_READY(unicode) == -1)
4888 return NULL;
4889
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004890 if (PyUnicode_UTF8(unicode))
4891 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4892 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
4894 kind = PyUnicode_KIND(unicode);
4895 data = PyUnicode_DATA(unicode);
4896 size = PyUnicode_GET_LENGTH(unicode);
4897
Benjamin Petersonead6b532011-12-20 17:23:42 -06004898 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004899 default:
4900 assert(0);
4901 case PyUnicode_1BYTE_KIND:
4902 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4903 assert(!PyUnicode_IS_ASCII(unicode));
4904 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4905 case PyUnicode_2BYTE_KIND:
4906 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4907 case PyUnicode_4BYTE_KIND:
4908 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910}
4911
Alexander Belopolsky40018472011-02-26 01:02:56 +00004912PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004913PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4914 Py_ssize_t size,
4915 const char *errors)
4916{
4917 PyObject *v, *unicode;
4918
4919 unicode = PyUnicode_FromUnicode(s, size);
4920 if (unicode == NULL)
4921 return NULL;
4922 v = _PyUnicode_AsUTF8String(unicode, errors);
4923 Py_DECREF(unicode);
4924 return v;
4925}
4926
4927PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004928PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004930 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931}
4932
Walter Dörwald41980ca2007-08-16 21:55:45 +00004933/* --- UTF-32 Codec ------------------------------------------------------- */
4934
4935PyObject *
4936PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 Py_ssize_t size,
4938 const char *errors,
4939 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940{
4941 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4942}
4943
4944PyObject *
4945PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors,
4948 int *byteorder,
4949 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950{
4951 const char *starts = s;
4952 Py_ssize_t startinpos;
4953 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004954 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004955 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004956 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004957 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 PyObject *errorHandler = NULL;
4960 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004961
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962 q = (unsigned char *)s;
4963 e = q + size;
4964
4965 if (byteorder)
4966 bo = *byteorder;
4967
4968 /* Check for BOM marks (U+FEFF) in the input and adjust current
4969 byte order setting accordingly. In native mode, the leading BOM
4970 mark is skipped, in all other modes, it is copied to the output
4971 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004972 if (bo == 0 && size >= 4) {
4973 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4974 if (bom == 0x0000FEFF) {
4975 bo = -1;
4976 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004978 else if (bom == 0xFFFE0000) {
4979 bo = 1;
4980 q += 4;
4981 }
4982 if (byteorder)
4983 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 }
4985
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 if (q == e) {
4987 if (consumed)
4988 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004989 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990 }
4991
Victor Stinnere64322e2012-10-30 23:12:47 +01004992#ifdef WORDS_BIGENDIAN
4993 le = bo < 0;
4994#else
4995 le = bo <= 0;
4996#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004997 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004998
Victor Stinner8f674cc2013-04-17 23:02:17 +02004999 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005000 writer.min_length = (e - q + 3) / 4;
5001 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005003
Victor Stinnere64322e2012-10-30 23:12:47 +01005004 while (1) {
5005 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005007
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 enum PyUnicode_Kind kind = writer.kind;
5010 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005011 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 if (le) {
5014 do {
5015 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5016 if (ch > maxch)
5017 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005018 if (kind != PyUnicode_1BYTE_KIND &&
5019 Py_UNICODE_IS_SURROGATE(ch))
5020 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005021 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 q += 4;
5023 } while (q <= last);
5024 }
5025 else {
5026 do {
5027 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5028 if (ch > maxch)
5029 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005030 if (kind != PyUnicode_1BYTE_KIND &&
5031 Py_UNICODE_IS_SURROGATE(ch))
5032 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005033 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 q += 4;
5035 } while (q <= last);
5036 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005037 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005038 }
5039
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005040 if (Py_UNICODE_IS_SURROGATE(ch)) {
5041 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5042 startinpos = ((const char *)q) - starts;
5043 endinpos = startinpos + 4;
5044 }
5045 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005046 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005048 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005050 startinpos = ((const char *)q) - starts;
5051 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005053 else {
5054 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005055 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005056 goto onError;
5057 q += 4;
5058 continue;
5059 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005061 startinpos = ((const char *)q) - starts;
5062 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005064
5065 /* The remaining input chars are ignored if the callback
5066 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005067 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005069 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005071 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073 }
5074
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 Py_XDECREF(errorHandler);
5079 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005080 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005083 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 Py_XDECREF(errorHandler);
5085 Py_XDECREF(exc);
5086 return NULL;
5087}
5088
5089PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005090_PyUnicode_EncodeUTF32(PyObject *str,
5091 const char *errors,
5092 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005094 int kind;
5095 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005096 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005098 unsigned char *p;
5099 Py_ssize_t nsize, i;
5100 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005101#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005102 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005104 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005106 const char *encoding;
5107 PyObject *errorHandler = NULL;
5108 PyObject *exc = NULL;
5109 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110
Serhiy Storchaka30793282014-01-04 22:44:01 +02005111#define STORECHAR(CH) \
5112 do { \
5113 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5114 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5115 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5116 p[iorder[0]] = (CH) & 0xff; \
5117 p += 4; \
5118 } while(0)
5119
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (!PyUnicode_Check(str)) {
5121 PyErr_BadArgument();
5122 return NULL;
5123 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005124 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005125 return NULL;
5126 kind = PyUnicode_KIND(str);
5127 data = PyUnicode_DATA(str);
5128 len = PyUnicode_GET_LENGTH(str);
5129
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005130 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005131 if (nsize > PY_SSIZE_T_MAX / 4)
5132 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005133 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (v == NULL)
5135 return NULL;
5136
Serhiy Storchaka30793282014-01-04 22:44:01 +02005137 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005139 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005140 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005141 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142
Serhiy Storchaka30793282014-01-04 22:44:01 +02005143 if (byteorder == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005149 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005150 }
5151 else if (byteorder == 1) {
5152 /* force BE */
5153 iorder[0] = 3;
5154 iorder[1] = 2;
5155 iorder[2] = 1;
5156 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005157 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005158 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005159 else
5160 encoding = "utf-32";
5161
5162 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005163 for (i = 0; i < len; i++)
5164 STORECHAR(PyUnicode_READ(kind, data, i));
5165 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166 }
5167
Serhiy Storchaka30793282014-01-04 22:44:01 +02005168 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005169 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005170 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5171 i++;
5172 assert(ch <= MAX_UNICODE);
5173 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5174 STORECHAR(ch);
5175 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005177
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005178 rep = unicode_encode_call_errorhandler(
5179 errors, &errorHandler,
5180 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005181 str, &exc, i-1, i, &i);
5182
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 if (!rep)
5184 goto error;
5185
5186 if (PyBytes_Check(rep)) {
5187 repsize = PyBytes_GET_SIZE(rep);
5188 if (repsize & 3) {
5189 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005190 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 "surrogates not allowed");
5192 goto error;
5193 }
5194 moreunits = repsize / 4;
5195 }
5196 else {
5197 assert(PyUnicode_Check(rep));
5198 if (PyUnicode_READY(rep) < 0)
5199 goto error;
5200 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5201 if (!PyUnicode_IS_ASCII(rep)) {
5202 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005203 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005204 "surrogates not allowed");
5205 goto error;
5206 }
5207 }
5208
5209 /* four bytes are reserved for each surrogate */
5210 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005211 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005212 Py_ssize_t morebytes = 4 * (moreunits - 1);
5213 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5214 /* integer overflow */
5215 PyErr_NoMemory();
5216 goto error;
5217 }
5218 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5219 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005220 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005221 }
5222
5223 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005224 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5225 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005226 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005227 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005228 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005229 repdata = PyUnicode_1BYTE_DATA(rep);
5230 while (repsize--) {
5231 Py_UCS4 ch = *repdata++;
5232 STORECHAR(ch);
5233 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005234 }
5235
5236 Py_CLEAR(rep);
5237 }
5238
5239 /* Cut back to size actually needed. This is necessary for, for example,
5240 encoding of a string containing isolated surrogates and the 'ignore'
5241 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005242 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005243 if (nsize != PyBytes_GET_SIZE(v))
5244 _PyBytes_Resize(&v, nsize);
5245 Py_XDECREF(errorHandler);
5246 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005247 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005248 error:
5249 Py_XDECREF(rep);
5250 Py_XDECREF(errorHandler);
5251 Py_XDECREF(exc);
5252 Py_XDECREF(v);
5253 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005254#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255}
5256
Alexander Belopolsky40018472011-02-26 01:02:56 +00005257PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005258PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5259 Py_ssize_t size,
5260 const char *errors,
5261 int byteorder)
5262{
5263 PyObject *result;
5264 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5265 if (tmp == NULL)
5266 return NULL;
5267 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5268 Py_DECREF(tmp);
5269 return result;
5270}
5271
5272PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005273PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274{
Victor Stinnerb960b342011-11-20 19:12:52 +01005275 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276}
5277
/* --- UTF-16 Codec ------------------------------------------------------- */
5279
Tim Peters772747b2001-08-09 22:21:55 +00005280PyObject *
5281PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 Py_ssize_t size,
5283 const char *errors,
5284 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
Walter Dörwald69652032004-09-07 20:24:22 +00005286 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5287}
5288
5289PyObject *
5290PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 Py_ssize_t size,
5292 const char *errors,
5293 int *byteorder,
5294 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t startinpos;
5298 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005300 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005301 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005302 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005303 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 PyObject *errorHandler = NULL;
5305 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005306 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Tim Peters772747b2001-08-09 22:21:55 +00005308 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005309 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
5311 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005312 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005314 /* Check for BOM marks (U+FEFF) in the input and adjust current
5315 byte order setting accordingly. In native mode, the leading BOM
5316 mark is skipped, in all other modes, it is copied to the output
5317 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 if (bo == 0 && size >= 2) {
5319 const Py_UCS4 bom = (q[1] << 8) | q[0];
5320 if (bom == 0xFEFF) {
5321 q += 2;
5322 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005324 else if (bom == 0xFFFE) {
5325 q += 2;
5326 bo = 1;
5327 }
5328 if (byteorder)
5329 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 if (q == e) {
5333 if (consumed)
5334 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005335 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005336 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005337
Christian Heimes743e0cd2012-10-17 23:52:17 +02005338#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005339 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005340 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005341#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005342 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005344#endif
Tim Peters772747b2001-08-09 22:21:55 +00005345
Antoine Pitrou63065d72012-05-15 23:48:04 +02005346 /* Note: size will always be longer than the resulting Unicode
5347 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005348 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005349 writer.min_length = (e - q + 1) / 2;
5350 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005351 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005352
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 while (1) {
5354 Py_UCS4 ch = 0;
5355 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005356 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005357 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005359 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005361 native_ordering);
5362 else
5363 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005365 native_ordering);
5366 } else if (kind == PyUnicode_2BYTE_KIND) {
5367 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005369 native_ordering);
5370 } else {
5371 assert(kind == PyUnicode_4BYTE_KIND);
5372 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005374 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005375 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377
Antoine Pitrou63065d72012-05-15 23:48:04 +02005378 switch (ch)
5379 {
5380 case 0:
5381 /* remaining byte at the end? (size should be even) */
5382 if (q == e || consumed)
5383 goto End;
5384 errmsg = "truncated data";
5385 startinpos = ((const char *)q) - starts;
5386 endinpos = ((const char *)e) - starts;
5387 break;
5388 /* The remaining input chars are ignored if the callback
5389 chooses to skip the input */
5390 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005391 q -= 2;
5392 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005393 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005394 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005395 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005396 endinpos = ((const char *)e) - starts;
5397 break;
5398 case 2:
5399 errmsg = "illegal encoding";
5400 startinpos = ((const char *)q) - 2 - starts;
5401 endinpos = startinpos + 2;
5402 break;
5403 case 3:
5404 errmsg = "illegal UTF-16 surrogate";
5405 startinpos = ((const char *)q) - 4 - starts;
5406 endinpos = startinpos + 2;
5407 break;
5408 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005409 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005410 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 continue;
5412 }
5413
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005414 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005415 errors,
5416 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005417 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005418 &starts,
5419 (const char **)&e,
5420 &startinpos,
5421 &endinpos,
5422 &exc,
5423 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005424 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 }
5427
Antoine Pitrou63065d72012-05-15 23:48:04 +02005428End:
Walter Dörwald69652032004-09-07 20:24:22 +00005429 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 Py_XDECREF(errorHandler);
5433 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005434 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005437 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 Py_XDECREF(errorHandler);
5439 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return NULL;
5441}
5442
Tim Peters772747b2001-08-09 22:21:55 +00005443PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444_PyUnicode_EncodeUTF16(PyObject *str,
5445 const char *errors,
5446 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005448 enum PyUnicode_Kind kind;
5449 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005450 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005451 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005452 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005453 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005454#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005455 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005456#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005457 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005458#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 const char *encoding;
5460 Py_ssize_t nsize, pos;
5461 PyObject *errorHandler = NULL;
5462 PyObject *exc = NULL;
5463 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 if (!PyUnicode_Check(str)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005469 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005470 return NULL;
5471 kind = PyUnicode_KIND(str);
5472 data = PyUnicode_DATA(str);
5473 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005474
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005476 if (kind == PyUnicode_4BYTE_KIND) {
5477 const Py_UCS4 *in = (const Py_UCS4 *)data;
5478 const Py_UCS4 *end = in + len;
5479 while (in < end)
5480 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005481 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005482 }
5483 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 nsize = len + pairs + (byteorder == 0);
5486 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 if (v == NULL)
5488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005490 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005491 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005492 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005494 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005495 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005496 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005497
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 if (kind == PyUnicode_1BYTE_KIND) {
5499 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5500 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005501 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005502
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 if (byteorder < 0)
5504 encoding = "utf-16-le";
5505 else if (byteorder > 0)
5506 encoding = "utf-16-be";
5507 else
5508 encoding = "utf-16";
5509
5510 pos = 0;
5511 while (pos < len) {
5512 Py_ssize_t repsize, moreunits;
5513
5514 if (kind == PyUnicode_2BYTE_KIND) {
5515 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5516 &out, native_ordering);
5517 }
5518 else {
5519 assert(kind == PyUnicode_4BYTE_KIND);
5520 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5521 &out, native_ordering);
5522 }
5523 if (pos == len)
5524 break;
5525
5526 rep = unicode_encode_call_errorhandler(
5527 errors, &errorHandler,
5528 encoding, "surrogates not allowed",
5529 str, &exc, pos, pos + 1, &pos);
5530 if (!rep)
5531 goto error;
5532
5533 if (PyBytes_Check(rep)) {
5534 repsize = PyBytes_GET_SIZE(rep);
5535 if (repsize & 1) {
5536 raise_encode_exception(&exc, encoding,
5537 str, pos - 1, pos,
5538 "surrogates not allowed");
5539 goto error;
5540 }
5541 moreunits = repsize / 2;
5542 }
5543 else {
5544 assert(PyUnicode_Check(rep));
5545 if (PyUnicode_READY(rep) < 0)
5546 goto error;
5547 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5548 if (!PyUnicode_IS_ASCII(rep)) {
5549 raise_encode_exception(&exc, encoding,
5550 str, pos - 1, pos,
5551 "surrogates not allowed");
5552 goto error;
5553 }
5554 }
5555
5556 /* two bytes are reserved for each surrogate */
5557 if (moreunits > 1) {
5558 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5559 Py_ssize_t morebytes = 2 * (moreunits - 1);
5560 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5561 /* integer overflow */
5562 PyErr_NoMemory();
5563 goto error;
5564 }
5565 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5566 goto error;
5567 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5568 }
5569
5570 if (PyBytes_Check(rep)) {
5571 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5572 out += moreunits;
5573 } else /* rep is unicode */ {
5574 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5575 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5576 &out, native_ordering);
5577 }
5578
5579 Py_CLEAR(rep);
5580 }
5581
5582 /* Cut back to size actually needed. This is necessary for, for example,
5583 encoding of a string containing isolated surrogates and the 'ignore' handler
5584 is used. */
5585 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5586 if (nsize != PyBytes_GET_SIZE(v))
5587 _PyBytes_Resize(&v, nsize);
5588 Py_XDECREF(errorHandler);
5589 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005590 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005591 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005592 error:
5593 Py_XDECREF(rep);
5594 Py_XDECREF(errorHandler);
5595 Py_XDECREF(exc);
5596 Py_XDECREF(v);
5597 return NULL;
5598#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599}
5600
Alexander Belopolsky40018472011-02-26 01:02:56 +00005601PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005602PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5603 Py_ssize_t size,
5604 const char *errors,
5605 int byteorder)
5606{
5607 PyObject *result;
5608 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5609 if (tmp == NULL)
5610 return NULL;
5611 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5612 Py_DECREF(tmp);
5613 return result;
5614}
5615
5616PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005617PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620}
5621
/* --- Unicode Escape Codec ----------------------------------------------- */
5623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5625 if all the escapes in the string make it still a valid ASCII string.
5626 Returns -1 if any escapes were found which cause the string to
5627 pop out of ASCII range. Otherwise returns the length of the
5628 required buffer to hold the string.
5629 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005630static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5632{
5633 const unsigned char *p = (const unsigned char *)s;
5634 const unsigned char *end = p + size;
5635 Py_ssize_t length = 0;
5636
5637 if (size < 0)
5638 return -1;
5639
5640 for (; p < end; ++p) {
5641 if (*p > 127) {
5642 /* Non-ASCII */
5643 return -1;
5644 }
5645 else if (*p != '\\') {
5646 /* Normal character */
5647 ++length;
5648 }
5649 else {
5650 /* Backslash-escape, check next char */
5651 ++p;
5652 /* Escape sequence reaches till end of string or
5653 non-ASCII follow-up. */
5654 if (p >= end || *p > 127)
5655 return -1;
5656 switch (*p) {
5657 case '\n':
5658 /* backslash + \n result in zero characters */
5659 break;
5660 case '\\': case '\'': case '\"':
5661 case 'b': case 'f': case 't':
5662 case 'n': case 'r': case 'v': case 'a':
5663 ++length;
5664 break;
5665 case '0': case '1': case '2': case '3':
5666 case '4': case '5': case '6': case '7':
5667 case 'x': case 'u': case 'U': case 'N':
5668 /* these do not guarantee ASCII characters */
5669 return -1;
5670 default:
5671 /* count the backslash + the other character */
5672 length += 2;
5673 }
5674 }
5675 }
5676 return length;
5677}
5678
Fredrik Lundh06d12682001-01-24 07:59:11 +00005679static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005680
Alexander Belopolsky40018472011-02-26 01:02:56 +00005681PyObject *
5682PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005683 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005684 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005687 Py_ssize_t startinpos;
5688 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 char* message;
5692 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 PyObject *errorHandler = NULL;
5694 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005696
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005697 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005698 if (len == 0)
5699 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005700
5701 /* After length_of_escaped_ascii_string() there are two alternatives,
5702 either the string is pure ASCII with named escapes like \n, etc.
5703 and we determined it's exact size (common case)
5704 or it contains \x, \u, ... escape sequences. then we create a
5705 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005706 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005708 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 }
5710 else {
5711 /* Escaped strings will always be longer than the resulting
5712 Unicode string, so we start with size here and then reduce the
5713 length after conversion to the true value.
5714 (but if the error callback returns a long replacement string
5715 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005716 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005717 }
5718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005720 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005722
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 while (s < end) {
5724 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005725 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
5728 /* Non-escape characters are interpreted as Unicode ordinals */
5729 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005730 x = (unsigned char)*s;
5731 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005732 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 continue;
5735 }
5736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 /* \ - Escapes */
5739 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005740 c = *s++;
5741 if (s > end)
5742 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005743
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005744 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005747#define WRITECHAR(ch) \
5748 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005749 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005751 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005752
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005754 case '\\': WRITECHAR('\\'); break;
5755 case '\'': WRITECHAR('\''); break;
5756 case '\"': WRITECHAR('\"'); break;
5757 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005758 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 case 'f': WRITECHAR('\014'); break;
5760 case 't': WRITECHAR('\t'); break;
5761 case 'n': WRITECHAR('\n'); break;
5762 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005763 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005764 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005765 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 case '0': case '1': case '2': case '3':
5770 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005771 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005772 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005773 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005774 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005775 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 break;
5779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* hex escapes */
5781 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005783 digits = 2;
5784 message = "truncated \\xXX escape";
5785 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005789 digits = 4;
5790 message = "truncated \\uXXXX escape";
5791 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005794 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005795 digits = 8;
5796 message = "truncated \\UXXXXXXXX escape";
5797 hexescape:
5798 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005799 if (end - s < digits) {
5800 /* count only hex digits */
5801 for (; s < end; ++s) {
5802 c = (unsigned char)*s;
5803 if (!Py_ISXDIGIT(c))
5804 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005805 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005806 goto error;
5807 }
5808 for (; digits--; ++s) {
5809 c = (unsigned char)*s;
5810 if (!Py_ISXDIGIT(c))
5811 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005812 chr = (chr<<4) & ~0xF;
5813 if (c >= '0' && c <= '9')
5814 chr += c - '0';
5815 else if (c >= 'a' && c <= 'f')
5816 chr += 10 + c - 'a';
5817 else
5818 chr += 10 + c - 'A';
5819 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005820 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 /* _decoding_error will have already written into the
5822 target buffer. */
5823 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005824 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005825 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005826 message = "illegal Unicode character";
5827 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005828 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005829 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 break;
5831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 case 'N':
5834 message = "malformed \\N character escape";
5835 if (ucnhash_CAPI == NULL) {
5836 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5838 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 if (ucnhash_CAPI == NULL)
5840 goto ucnhashError;
5841 }
5842 if (*s == '{') {
5843 const char *start = s+1;
5844 /* look for the closing brace */
5845 while (*s != '}' && s < end)
5846 s++;
5847 if (s > start && s < end && *s == '}') {
5848 /* found a name. look it up in the unicode database */
5849 message = "unknown Unicode character name";
5850 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005851 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005852 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005853 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005854 goto store;
5855 }
5856 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005857 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005858
5859 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005860 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 message = "\\ at end of string";
5862 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005863 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005864 }
5865 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005866 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005867 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005868 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005869 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005871 continue;
5872
5873 error:
5874 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005875 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005876 errors, &errorHandler,
5877 "unicodeescape", message,
5878 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005879 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005880 goto onError;
5881 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005883#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005884
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005885 Py_XDECREF(errorHandler);
5886 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005887 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005888
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005890 PyErr_SetString(
5891 PyExc_UnicodeError,
5892 "\\N escapes not supported (can't load unicodedata module)"
5893 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005897 return NULL;
5898
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 Py_XDECREF(errorHandler);
5902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 return NULL;
5904}
5905
5906/* Return a Unicode-Escape string version of the Unicode object.
5907
5908 If quotes is true, the string is enclosed in u"" or u'' quotes as
5909 appropriate.
5910
5911*/
5912
Alexander Belopolsky40018472011-02-26 01:02:56 +00005913PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005917 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 int kind;
5920 void *data;
5921 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922
Ezio Melottie7f90372012-10-05 03:33:31 +03005923 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005924 escape.
5925
Ezio Melottie7f90372012-10-05 03:33:31 +03005926 For UCS1 strings it's '\xxx', 4 bytes per source character.
5927 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5928 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005929 */
5930
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 if (!PyUnicode_Check(unicode)) {
5932 PyErr_BadArgument();
5933 return NULL;
5934 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005935 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936 return NULL;
5937 len = PyUnicode_GET_LENGTH(unicode);
5938 kind = PyUnicode_KIND(unicode);
5939 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005940 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5942 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5943 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5944 }
5945
5946 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 return PyBytes_FromStringAndSize(NULL, 0);
5948
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005951
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 if (repr == NULL)
5957 return NULL;
5958
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005959 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005962 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005963
Walter Dörwald79e913e2007-05-12 11:08:06 +00005964 /* Escape backslashes */
5965 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 *p++ = '\\';
5967 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005968 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005971 /* Map 21-bit characters to '\U00xxxxxx' */
5972 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005973 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005974 *p++ = '\\';
5975 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005976 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5982 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5983 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005985 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005988 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 *p++ = '\\';
5990 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005991 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5992 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5993 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5994 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005996
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005997 /* Map special whitespace to '\t', \n', '\r' */
5998 else if (ch == '\t') {
5999 *p++ = '\\';
6000 *p++ = 't';
6001 }
6002 else if (ch == '\n') {
6003 *p++ = '\\';
6004 *p++ = 'n';
6005 }
6006 else if (ch == '\r') {
6007 *p++ = '\\';
6008 *p++ = 'r';
6009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006010
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006011 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006012 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006014 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006015 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6016 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006017 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006018
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 /* Copy everything else as-is */
6020 else
6021 *p++ = (char) ch;
6022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 assert(p - PyBytes_AS_STRING(repr) > 0);
6025 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6026 return NULL;
6027 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
Alexander Belopolsky40018472011-02-26 01:02:56 +00006030PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6032 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 PyObject *result;
6035 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6036 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 result = PyUnicode_AsUnicodeEscapeString(tmp);
6039 Py_DECREF(tmp);
6040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041}
6042
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6044
Alexander Belopolsky40018472011-02-26 01:02:56 +00006045PyObject *
6046PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006047 Py_ssize_t size,
6048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006051 Py_ssize_t startinpos;
6052 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 const char *end;
6055 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 PyObject *errorHandler = NULL;
6057 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006058
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006059 if (size == 0)
6060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 /* Escaped strings will always be longer than the resulting
6063 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 length after conversion to the true value. (But decoding error
6065 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006066 _PyUnicodeWriter_Init(&writer);
6067 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 end = s + size;
6070 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 unsigned char c;
6072 Py_UCS4 x;
6073 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006074 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 /* Non-escape characters are interpreted as Unicode ordinals */
6077 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006078 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006079 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006080 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 startinpos = s-starts;
6084
6085 /* \u-escapes are only interpreted iff the number of leading
6086 backslashes if odd */
6087 bs = s;
6088 for (;s < end;) {
6089 if (*s != '\\')
6090 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006091 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006092 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006093 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 }
6095 if (((s - bs) & 1) == 0 ||
6096 s >= end ||
6097 (*s != 'u' && *s != 'U')) {
6098 continue;
6099 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006100 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 count = *s=='u' ? 4 : 8;
6102 s++;
6103
6104 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 for (x = 0, i = 0; i < count; ++i, ++s) {
6106 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006107 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006109 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 errors, &errorHandler,
6111 "rawunicodeescape", "truncated \\uXXXX",
6112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006113 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 goto onError;
6115 goto nextByte;
6116 }
6117 x = (x<<4) & ~0xF;
6118 if (c >= '0' && c <= '9')
6119 x += c - '0';
6120 else if (c >= 'a' && c <= 'f')
6121 x += 10 + c - 'a';
6122 else
6123 x += 10 + c - 'A';
6124 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006125 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006126 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006127 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006128 }
6129 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006130 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006131 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006132 errors, &errorHandler,
6133 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006135 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 nextByte:
6139 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006143 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006144
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006146 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return NULL;
6150}
6151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006156 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 char *p;
6158 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 Py_ssize_t expandsize, pos;
6160 int kind;
6161 void *data;
6162 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 if (!PyUnicode_Check(unicode)) {
6165 PyErr_BadArgument();
6166 return NULL;
6167 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006168 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 return NULL;
6170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
6172 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006173 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6174 bytes, and 1 byte characters 4. */
6175 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006176
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (repr == NULL)
6182 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006184 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006187 for (pos = 0; pos < len; pos++) {
6188 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 /* Map 32-bit characters to '\Uxxxxxxxx' */
6190 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006191 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006192 *p++ = '\\';
6193 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006194 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6195 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6199 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6201 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006202 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006204 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 *p++ = '\\';
6206 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006207 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6208 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6210 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 /* Copy everything else as-is */
6213 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = (char) ch;
6215 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006216
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217 assert(p > q);
6218 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006219 return NULL;
6220 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Alexander Belopolsky40018472011-02-26 01:02:56 +00006223PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6225 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006227 PyObject *result;
6228 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6229 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006230 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6232 Py_DECREF(tmp);
6233 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234}
6235
/* --- Unicode Internal Codec ------------------------------------------- */
6237
Alexander Belopolsky40018472011-02-26 01:02:56 +00006238PyObject *
6239_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006240 Py_ssize_t size,
6241 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006242{
6243 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006244 Py_ssize_t startinpos;
6245 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006246 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 const char *end;
6248 const char *reason;
6249 PyObject *errorHandler = NULL;
6250 PyObject *exc = NULL;
6251
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006253 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 1))
6255 return NULL;
6256
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006257 if (size == 0)
6258 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006259
Victor Stinner8f674cc2013-04-17 23:02:17 +02006260 _PyUnicodeWriter_Init(&writer);
6261 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6262 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006264 }
6265 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006266
Victor Stinner8f674cc2013-04-17 23:02:17 +02006267 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006269 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006270 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006271 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006272 endinpos = end-starts;
6273 reason = "truncated input";
6274 goto error;
6275 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006276 /* We copy the raw representation one byte at a time because the
6277 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006278 ((char *) &uch)[0] = s[0];
6279 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006280#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006281 ((char *) &uch)[2] = s[2];
6282 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006283#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006284 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006285#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 /* We have to sanity check the raw data, otherwise doom looms for
6287 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006288 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006289 endinpos = s - starts + Py_UNICODE_SIZE;
6290 reason = "illegal code point (> 0x10FFFF)";
6291 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006292 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006293#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294 s += Py_UNICODE_SIZE;
6295#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006296 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006297 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006298 Py_UNICODE uch2;
6299 ((char *) &uch2)[0] = s[0];
6300 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006301 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006302 {
Victor Stinner551ac952011-11-29 22:58:13 +01006303 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305 }
6306 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006307#endif
6308
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006309 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006310 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006311 continue;
6312
6313 error:
6314 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006315 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006316 errors, &errorHandler,
6317 "unicode_internal", reason,
6318 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006319 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006320 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006321 }
6322
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 Py_XDECREF(errorHandler);
6324 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006325 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006328 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 Py_XDECREF(errorHandler);
6330 Py_XDECREF(exc);
6331 return NULL;
6332}
6333
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334/* --- Latin-1 Codec ------------------------------------------------------ */
6335
Alexander Belopolsky40018472011-02-26 01:02:56 +00006336PyObject *
6337PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006338 Py_ssize_t size,
6339 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006342 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006346static void
6347make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006348 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006349 PyObject *unicode,
6350 Py_ssize_t startpos, Py_ssize_t endpos,
6351 const char *reason)
6352{
6353 if (*exceptionObject == NULL) {
6354 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006355 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006356 encoding, unicode, startpos, endpos, reason);
6357 }
6358 else {
6359 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6360 goto onError;
6361 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6362 goto onError;
6363 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6364 goto onError;
6365 return;
6366 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006367 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006368 }
6369}
6370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006372static void
6373raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006374 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006375 PyObject *unicode,
6376 Py_ssize_t startpos, Py_ssize_t endpos,
6377 const char *reason)
6378{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006379 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006380 encoding, unicode, startpos, endpos, reason);
6381 if (*exceptionObject != NULL)
6382 PyCodec_StrictErrors(*exceptionObject);
6383}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384
6385/* error handling callback helper:
6386 build arguments, call the callback and check the arguments,
6387 put the result into newpos and return the replacement string, which
6388 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006389static PyObject *
6390unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006391 PyObject **errorHandler,
6392 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006394 Py_ssize_t startpos, Py_ssize_t endpos,
6395 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006397 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 PyObject *restuple;
6400 PyObject *resunicode;
6401
6402 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 }
6407
Benjamin Petersonbac79492012-01-14 13:34:47 -05006408 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 return NULL;
6410 len = PyUnicode_GET_LENGTH(unicode);
6411
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006412 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006413 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416
6417 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006422 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 Py_DECREF(restuple);
6424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006426 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 &resunicode, newpos)) {
6428 Py_DECREF(restuple);
6429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006431 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6432 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6433 Py_DECREF(restuple);
6434 return NULL;
6435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006437 *newpos = len + *newpos;
6438 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6440 Py_DECREF(restuple);
6441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 Py_INCREF(resunicode);
6444 Py_DECREF(restuple);
6445 return resunicode;
6446}
6447
Alexander Belopolsky40018472011-02-26 01:02:56 +00006448static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006450 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006451 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453 /* input state */
6454 Py_ssize_t pos=0, size;
6455 int kind;
6456 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 /* output object */
6458 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 /* pointer into the output */
6460 char *str;
6461 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006463 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6464 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 PyObject *errorHandler = NULL;
6466 PyObject *exc = NULL;
6467 /* the following variable is used for caching string comparisons
6468 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6469 int known_errorHandler = -1;
6470
Benjamin Petersonbac79492012-01-14 13:34:47 -05006471 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 return NULL;
6473 size = PyUnicode_GET_LENGTH(unicode);
6474 kind = PyUnicode_KIND(unicode);
6475 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 /* allocate enough for a simple encoding without
6477 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006478 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006479 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006480 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006481 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006482 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006483 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 ressize = size;
6485
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 while (pos < size) {
6487 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 /* can we encode this? */
6490 if (c<limit) {
6491 /* no overflow check, because we know that the space is enough */
6492 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006494 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 Py_ssize_t requiredsize;
6497 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 Py_ssize_t collstart = pos;
6501 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 ++collend;
6505 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6506 if (known_errorHandler==-1) {
6507 if ((errors==NULL) || (!strcmp(errors, "strict")))
6508 known_errorHandler = 1;
6509 else if (!strcmp(errors, "replace"))
6510 known_errorHandler = 2;
6511 else if (!strcmp(errors, "ignore"))
6512 known_errorHandler = 3;
6513 else if (!strcmp(errors, "xmlcharrefreplace"))
6514 known_errorHandler = 4;
6515 else
6516 known_errorHandler = 0;
6517 }
6518 switch (known_errorHandler) {
6519 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006520 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 goto onError;
6522 case 2: /* replace */
6523 while (collstart++<collend)
6524 *str++ = '?'; /* fall through */
6525 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 break;
6528 case 4: /* xmlcharrefreplace */
6529 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 /* determine replacement size */
6531 for (i = collstart, repsize = 0; i < collend; ++i) {
6532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6533 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006545 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006546 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006550 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 if (requiredsize > ressize) {
6552 if (requiredsize<2*ressize)
6553 requiredsize = 2*ressize;
6554 if (_PyBytes_Resize(&res, requiredsize))
6555 goto onError;
6556 str = PyBytes_AS_STRING(res) + respos;
6557 ressize = requiredsize;
6558 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 /* generate replacement */
6560 for (i = collstart; i < collend; ++i) {
6561 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006563 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 break;
6565 default:
6566 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006567 encoding, reason, unicode, &exc,
6568 collstart, collend, &newpos);
6569 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006570 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006572 if (PyBytes_Check(repunicode)) {
6573 /* Directly copy bytes result to output. */
6574 repsize = PyBytes_Size(repunicode);
6575 if (repsize > 1) {
6576 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006577 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006578 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6579 Py_DECREF(repunicode);
6580 goto onError;
6581 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006582 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006583 ressize += repsize-1;
6584 }
6585 memcpy(str, PyBytes_AsString(repunicode), repsize);
6586 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006588 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006589 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* need more space? (at least enough for what we
6592 have+the replacement+the rest of the string, so
6593 we won't have to check space for encodable characters) */
6594 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595 repsize = PyUnicode_GET_LENGTH(repunicode);
6596 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 if (requiredsize > ressize) {
6598 if (requiredsize<2*ressize)
6599 requiredsize = 2*ressize;
6600 if (_PyBytes_Resize(&res, requiredsize)) {
6601 Py_DECREF(repunicode);
6602 goto onError;
6603 }
6604 str = PyBytes_AS_STRING(res) + respos;
6605 ressize = requiredsize;
6606 }
6607 /* check if there is anything unencodable in the replacement
6608 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006609 for (i = 0; repsize-->0; ++i, ++str) {
6610 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006612 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 Py_DECREF(repunicode);
6615 goto onError;
6616 }
6617 *str = (char)c;
6618 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006622 }
6623 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006624 /* Resize if we allocated to much */
6625 size = str - PyBytes_AS_STRING(res);
6626 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006627 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006628 if (_PyBytes_Resize(&res, size) < 0)
6629 goto onError;
6630 }
6631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 Py_XDECREF(errorHandler);
6633 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006634 return res;
6635
6636 onError:
6637 Py_XDECREF(res);
6638 Py_XDECREF(errorHandler);
6639 Py_XDECREF(exc);
6640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641}
6642
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644PyObject *
6645PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 Py_ssize_t size,
6647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 PyObject *result;
6650 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6651 if (unicode == NULL)
6652 return NULL;
6653 result = unicode_encode_ucs1(unicode, errors, 256);
6654 Py_DECREF(unicode);
6655 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656}
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
6661 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 PyErr_BadArgument();
6663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665 if (PyUnicode_READY(unicode) == -1)
6666 return NULL;
6667 /* Fast path: if it is a one-byte string, construct
6668 bytes object directly. */
6669 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6670 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6671 PyUnicode_GET_LENGTH(unicode));
6672 /* Non-Latin-1 characters present. Defer to above function to
6673 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675}
6676
6677PyObject*
6678PyUnicode_AsLatin1String(PyObject *unicode)
6679{
6680 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
6683/* --- 7-bit ASCII Codec -------------------------------------------------- */
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685PyObject *
6686PyUnicode_DecodeASCII(const char *s,
6687 Py_ssize_t size,
6688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006691 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006692 int kind;
6693 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t startinpos;
6695 Py_ssize_t endinpos;
6696 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 const char *e;
6698 PyObject *errorHandler = NULL;
6699 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006702 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006705 if (size == 1 && (unsigned char)s[0] < 128)
6706 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006707
Victor Stinner8f674cc2013-04-17 23:02:17 +02006708 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006709 writer.min_length = size;
6710 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006711 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006714 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006715 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 writer.pos = outpos;
6717 if (writer.pos == size)
6718 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006719
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006720 s += writer.pos;
6721 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006723 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006725 PyUnicode_WRITE(kind, data, writer.pos, c);
6726 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 ++s;
6728 }
6729 else {
6730 startinpos = s-starts;
6731 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006732 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 errors, &errorHandler,
6734 "ascii", "ordinal not in range(128)",
6735 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006736 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 kind = writer.kind;
6739 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 Py_XDECREF(errorHandler);
6743 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006744 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006745
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006747 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 Py_XDECREF(errorHandler);
6749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
6751}
6752
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006754PyObject *
6755PyUnicode_EncodeASCII(const Py_UNICODE *p,
6756 Py_ssize_t size,
6757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 PyObject *result;
6760 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6761 if (unicode == NULL)
6762 return NULL;
6763 result = unicode_encode_ucs1(unicode, errors, 128);
6764 Py_DECREF(unicode);
6765 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766}
6767
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006769_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 PyErr_BadArgument();
6773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 if (PyUnicode_READY(unicode) == -1)
6776 return NULL;
6777 /* Fast path: if it is an ASCII-only string, construct bytes object
6778 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006779 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6781 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783}
6784
6785PyObject *
6786PyUnicode_AsASCIIString(PyObject *unicode)
6787{
6788 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Victor Stinner99b95382011-07-04 14:23:54 +02006791#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006792
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006793/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
/* NEED_RETRY is defined when int is narrower than size_t. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Supply a value when the included headers did not define the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6802
6803static char*
6804code_page_name(UINT code_page, PyObject **obj)
6805{
6806 *obj = NULL;
6807 if (code_page == CP_ACP)
6808 return "mbcs";
6809 if (code_page == CP_UTF7)
6810 return "CP_UTF7";
6811 if (code_page == CP_UTF8)
6812 return "CP_UTF8";
6813
6814 *obj = PyBytes_FromFormat("cp%u", code_page);
6815 if (*obj == NULL)
6816 return NULL;
6817 return PyBytes_AS_STRING(*obj);
6818}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
Victor Stinner3a50e702011-10-18 21:21:00 +02006820static DWORD
6821decode_code_page_flags(UINT code_page)
6822{
6823 if (code_page == CP_UTF7) {
6824 /* The CP_UTF7 decoder only supports flags=0 */
6825 return 0;
6826 }
6827 else
6828 return MB_ERR_INVALID_CHARS;
6829}
6830
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006832 * Decode a byte string from a Windows code page into unicode object in strict
6833 * mode.
6834 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006835 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6836 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006838static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006839decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006840 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006841 const char *in,
6842 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843{
Victor Stinner3a50e702011-10-18 21:21:00 +02006844 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006845 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847
6848 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006849 assert(insize > 0);
6850 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6851 if (outsize <= 0)
6852 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
6854 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006856 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006857 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 if (*v == NULL)
6859 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006860 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861 }
6862 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006865 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 }
6869
6870 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6872 if (outsize <= 0)
6873 goto error;
6874 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006875
Victor Stinner3a50e702011-10-18 21:21:00 +02006876error:
6877 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6878 return -2;
6879 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006880 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881}
6882
Victor Stinner3a50e702011-10-18 21:21:00 +02006883/*
6884 * Decode a byte string from a code page into unicode object with an error
6885 * handler.
6886 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006887 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 * UnicodeDecodeError exception and returns -1 on error.
6889 */
6890static int
6891decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 PyObject **v,
6893 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006894 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006895{
6896 const char *startin = in;
6897 const char *endin = in + size;
6898 const DWORD flags = decode_code_page_flags(code_page);
6899 /* Ideally, we should get reason from FormatMessage. This is the Windows
6900 2000 English version of the message. */
6901 const char *reason = "No mapping for the Unicode character exists "
6902 "in the target code page.";
6903 /* each step cannot decode more than 1 character, but a character can be
6904 represented as a surrogate pair */
6905 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006906 int insize;
6907 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 PyObject *errorHandler = NULL;
6909 PyObject *exc = NULL;
6910 PyObject *encoding_obj = NULL;
6911 char *encoding;
6912 DWORD err;
6913 int ret = -1;
6914
6915 assert(size > 0);
6916
6917 encoding = code_page_name(code_page, &encoding_obj);
6918 if (encoding == NULL)
6919 return -1;
6920
Victor Stinner7d00cc12014-03-17 23:08:06 +01006921 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006922 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6923 UnicodeDecodeError. */
6924 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6925 if (exc != NULL) {
6926 PyCodec_StrictErrors(exc);
6927 Py_CLEAR(exc);
6928 }
6929 goto error;
6930 }
6931
6932 if (*v == NULL) {
6933 /* Create unicode object */
6934 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6935 PyErr_NoMemory();
6936 goto error;
6937 }
Victor Stinnerab595942011-12-17 04:59:06 +01006938 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006939 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 if (*v == NULL)
6941 goto error;
6942 startout = PyUnicode_AS_UNICODE(*v);
6943 }
6944 else {
6945 /* Extend unicode object */
6946 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6947 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6948 PyErr_NoMemory();
6949 goto error;
6950 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006951 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006952 goto error;
6953 startout = PyUnicode_AS_UNICODE(*v) + n;
6954 }
6955
6956 /* Decode the byte string character per character */
6957 out = startout;
6958 while (in < endin)
6959 {
6960 /* Decode a character */
6961 insize = 1;
6962 do
6963 {
6964 outsize = MultiByteToWideChar(code_page, flags,
6965 in, insize,
6966 buffer, Py_ARRAY_LENGTH(buffer));
6967 if (outsize > 0)
6968 break;
6969 err = GetLastError();
6970 if (err != ERROR_NO_UNICODE_TRANSLATION
6971 && err != ERROR_INSUFFICIENT_BUFFER)
6972 {
6973 PyErr_SetFromWindowsErr(0);
6974 goto error;
6975 }
6976 insize++;
6977 }
6978 /* 4=maximum length of a UTF-8 sequence */
6979 while (insize <= 4 && (in + insize) <= endin);
6980
6981 if (outsize <= 0) {
6982 Py_ssize_t startinpos, endinpos, outpos;
6983
Victor Stinner7d00cc12014-03-17 23:08:06 +01006984 /* last character in partial decode? */
6985 if (in + insize >= endin && !final)
6986 break;
6987
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 startinpos = in - startin;
6989 endinpos = startinpos + 1;
6990 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006991 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 errors, &errorHandler,
6993 encoding, reason,
6994 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006995 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 {
6997 goto error;
6998 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006999 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 }
7001 else {
7002 in += insize;
7003 memcpy(out, buffer, outsize * sizeof(wchar_t));
7004 out += outsize;
7005 }
7006 }
7007
7008 /* write a NUL character at the end */
7009 *out = 0;
7010
7011 /* Extend unicode object */
7012 outsize = out - startout;
7013 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007014 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 goto error;
Victor Stinner7d00cc12014-03-17 23:08:06 +01007016 ret = in - startin;
Victor Stinner3a50e702011-10-18 21:21:00 +02007017
7018error:
7019 Py_XDECREF(encoding_obj);
7020 Py_XDECREF(errorHandler);
7021 Py_XDECREF(exc);
7022 return ret;
7023}
7024
Victor Stinner3a50e702011-10-18 21:21:00 +02007025static PyObject *
7026decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007027 const char *s, Py_ssize_t size,
7028 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029{
Victor Stinner76a31a62011-11-04 00:05:13 +01007030 PyObject *v = NULL;
7031 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
Victor Stinner3a50e702011-10-18 21:21:00 +02007033 if (code_page < 0) {
7034 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7035 return NULL;
7036 }
7037
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040
Victor Stinner76a31a62011-11-04 00:05:13 +01007041 do
7042 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007043#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007044 if (size > INT_MAX) {
7045 chunk_size = INT_MAX;
7046 final = 0;
7047 done = 0;
7048 }
7049 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007051 {
7052 chunk_size = (int)size;
7053 final = (consumed == NULL);
7054 done = 1;
7055 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056
Victor Stinner76a31a62011-11-04 00:05:13 +01007057 if (chunk_size == 0 && done) {
7058 if (v != NULL)
7059 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007061 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062
Victor Stinner76a31a62011-11-04 00:05:13 +01007063 converted = decode_code_page_strict(code_page, &v,
7064 s, chunk_size);
7065 if (converted == -2)
7066 converted = decode_code_page_errors(code_page, &v,
7067 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007068 errors, final);
7069 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007070
7071 if (converted < 0) {
7072 Py_XDECREF(v);
7073 return NULL;
7074 }
7075
7076 if (consumed)
7077 *consumed += converted;
7078
7079 s += converted;
7080 size -= converted;
7081 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007083 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084}
7085
Alexander Belopolsky40018472011-02-26 01:02:56 +00007086PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007087PyUnicode_DecodeCodePageStateful(int code_page,
7088 const char *s,
7089 Py_ssize_t size,
7090 const char *errors,
7091 Py_ssize_t *consumed)
7092{
7093 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7094}
7095
7096PyObject *
7097PyUnicode_DecodeMBCSStateful(const char *s,
7098 Py_ssize_t size,
7099 const char *errors,
7100 Py_ssize_t *consumed)
7101{
7102 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7103}
7104
7105PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007106PyUnicode_DecodeMBCS(const char *s,
7107 Py_ssize_t size,
7108 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007109{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7111}
7112
Victor Stinner3a50e702011-10-18 21:21:00 +02007113static DWORD
7114encode_code_page_flags(UINT code_page, const char *errors)
7115{
7116 if (code_page == CP_UTF8) {
7117 if (winver.dwMajorVersion >= 6)
7118 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7119 and later */
7120 return WC_ERR_INVALID_CHARS;
7121 else
7122 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7123 return 0;
7124 }
7125 else if (code_page == CP_UTF7) {
7126 /* CP_UTF7 only supports flags=0 */
7127 return 0;
7128 }
7129 else {
7130 if (errors != NULL && strcmp(errors, "replace") == 0)
7131 return 0;
7132 else
7133 return WC_NO_BEST_FIT_CHARS;
7134 }
7135}
7136
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 * Encode a Unicode string to a Windows code page into a byte string in strict
7139 * mode.
7140 *
7141 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007142 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007144static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007145encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007146 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148{
Victor Stinner554f3f02010-06-16 23:33:54 +00007149 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 BOOL *pusedDefaultChar = &usedDefaultChar;
7151 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007152 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007153 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007154 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 const DWORD flags = encode_code_page_flags(code_page, NULL);
7156 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007157 /* Create a substring so that we can get the UTF-16 representation
7158 of just the slice under consideration. */
7159 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160
Martin v. Löwis3d325192011-11-04 18:23:06 +01007161 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007162
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007164 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007166 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007167
Victor Stinner2fc507f2011-11-04 20:06:39 +01007168 substring = PyUnicode_Substring(unicode, offset, offset+len);
7169 if (substring == NULL)
7170 return -1;
7171 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7172 if (p == NULL) {
7173 Py_DECREF(substring);
7174 return -1;
7175 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007176 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007177
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007178 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007180 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 NULL, 0,
7182 NULL, pusedDefaultChar);
7183 if (outsize <= 0)
7184 goto error;
7185 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007186 if (pusedDefaultChar && *pusedDefaultChar) {
7187 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007189 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007190
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194 if (*outbytes == NULL) {
7195 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007197 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199 }
7200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 const Py_ssize_t n = PyBytes_Size(*outbytes);
7203 if (outsize > PY_SSIZE_T_MAX - n) {
7204 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007208 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7209 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007211 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213 }
7214
7215 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007217 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 out, outsize,
7219 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 if (outsize <= 0)
7222 goto error;
7223 if (pusedDefaultChar && *pusedDefaultChar)
7224 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007225 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007226
Victor Stinner3a50e702011-10-18 21:21:00 +02007227error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007228 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7230 return -2;
7231 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007232 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007233}
7234
Victor Stinner3a50e702011-10-18 21:21:00 +02007235/*
7236 * Encode a Unicode string to a Windows code page into a byte string using a
7237 * error handler.
7238 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007239 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 * -1 on other error.
7241 */
7242static int
7243encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007244 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007245 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007246{
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007248 Py_ssize_t pos = unicode_offset;
7249 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 /* Ideally, we should get reason from FormatMessage. This is the Windows
7251 2000 English version of the message. */
7252 const char *reason = "invalid character";
7253 /* 4=maximum length of a UTF-8 sequence */
7254 char buffer[4];
7255 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7256 Py_ssize_t outsize;
7257 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 PyObject *errorHandler = NULL;
7259 PyObject *exc = NULL;
7260 PyObject *encoding_obj = NULL;
7261 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007262 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 PyObject *rep;
7264 int ret = -1;
7265
7266 assert(insize > 0);
7267
7268 encoding = code_page_name(code_page, &encoding_obj);
7269 if (encoding == NULL)
7270 return -1;
7271
7272 if (errors == NULL || strcmp(errors, "strict") == 0) {
7273 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7274 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007275 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 if (exc != NULL) {
7277 PyCodec_StrictErrors(exc);
7278 Py_DECREF(exc);
7279 }
7280 Py_XDECREF(encoding_obj);
7281 return -1;
7282 }
7283
7284 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7285 pusedDefaultChar = &usedDefaultChar;
7286 else
7287 pusedDefaultChar = NULL;
7288
7289 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7290 PyErr_NoMemory();
7291 goto error;
7292 }
7293 outsize = insize * Py_ARRAY_LENGTH(buffer);
7294
7295 if (*outbytes == NULL) {
7296 /* Create string object */
7297 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7298 if (*outbytes == NULL)
7299 goto error;
7300 out = PyBytes_AS_STRING(*outbytes);
7301 }
7302 else {
7303 /* Extend string object */
7304 Py_ssize_t n = PyBytes_Size(*outbytes);
7305 if (n > PY_SSIZE_T_MAX - outsize) {
7306 PyErr_NoMemory();
7307 goto error;
7308 }
7309 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7310 goto error;
7311 out = PyBytes_AS_STRING(*outbytes) + n;
7312 }
7313
7314 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007315 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007317 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7318 wchar_t chars[2];
7319 int charsize;
7320 if (ch < 0x10000) {
7321 chars[0] = (wchar_t)ch;
7322 charsize = 1;
7323 }
7324 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007325 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7326 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007327 charsize = 2;
7328 }
7329
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007331 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 buffer, Py_ARRAY_LENGTH(buffer),
7333 NULL, pusedDefaultChar);
7334 if (outsize > 0) {
7335 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7336 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007337 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 memcpy(out, buffer, outsize);
7339 out += outsize;
7340 continue;
7341 }
7342 }
7343 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7344 PyErr_SetFromWindowsErr(0);
7345 goto error;
7346 }
7347
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 rep = unicode_encode_call_errorhandler(
7349 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007350 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007351 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 if (rep == NULL)
7353 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007355
7356 if (PyBytes_Check(rep)) {
7357 outsize = PyBytes_GET_SIZE(rep);
7358 if (outsize != 1) {
7359 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7360 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7361 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7362 Py_DECREF(rep);
7363 goto error;
7364 }
7365 out = PyBytes_AS_STRING(*outbytes) + offset;
7366 }
7367 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7368 out += outsize;
7369 }
7370 else {
7371 Py_ssize_t i;
7372 enum PyUnicode_Kind kind;
7373 void *data;
7374
Benjamin Petersonbac79492012-01-14 13:34:47 -05007375 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 Py_DECREF(rep);
7377 goto error;
7378 }
7379
7380 outsize = PyUnicode_GET_LENGTH(rep);
7381 if (outsize != 1) {
7382 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7383 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7384 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7385 Py_DECREF(rep);
7386 goto error;
7387 }
7388 out = PyBytes_AS_STRING(*outbytes) + offset;
7389 }
7390 kind = PyUnicode_KIND(rep);
7391 data = PyUnicode_DATA(rep);
7392 for (i=0; i < outsize; i++) {
7393 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7394 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007395 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007396 encoding, unicode,
7397 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 "unable to encode error handler result to ASCII");
7399 Py_DECREF(rep);
7400 goto error;
7401 }
7402 *out = (unsigned char)ch;
7403 out++;
7404 }
7405 }
7406 Py_DECREF(rep);
7407 }
7408 /* write a NUL byte */
7409 *out = 0;
7410 outsize = out - PyBytes_AS_STRING(*outbytes);
7411 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7412 if (_PyBytes_Resize(outbytes, outsize) < 0)
7413 goto error;
7414 ret = 0;
7415
7416error:
7417 Py_XDECREF(encoding_obj);
7418 Py_XDECREF(errorHandler);
7419 Py_XDECREF(exc);
7420 return ret;
7421}
7422
Victor Stinner3a50e702011-10-18 21:21:00 +02007423static PyObject *
7424encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007425 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 const char *errors)
7427{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007428 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007430 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007431 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007432
Benjamin Petersonbac79492012-01-14 13:34:47 -05007433 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007434 return NULL;
7435 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007436
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 if (code_page < 0) {
7438 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7439 return NULL;
7440 }
7441
Martin v. Löwis3d325192011-11-04 18:23:06 +01007442 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 return PyBytes_FromStringAndSize(NULL, 0);
7444
Victor Stinner7581cef2011-11-03 22:32:33 +01007445 offset = 0;
7446 do
7447 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007449 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450 chunks. */
7451 if (len > INT_MAX/2) {
7452 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007453 done = 0;
7454 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007455 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007457 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007458 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007459 done = 1;
7460 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007463 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007464 errors);
7465 if (ret == -2)
7466 ret = encode_code_page_errors(code_page, &outbytes,
7467 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007468 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007469 if (ret < 0) {
7470 Py_XDECREF(outbytes);
7471 return NULL;
7472 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473
Victor Stinner7581cef2011-11-03 22:32:33 +01007474 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 return outbytes;
7479}
7480
7481PyObject *
7482PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7483 Py_ssize_t size,
7484 const char *errors)
7485{
Victor Stinner7581cef2011-11-03 22:32:33 +01007486 PyObject *unicode, *res;
7487 unicode = PyUnicode_FromUnicode(p, size);
7488 if (unicode == NULL)
7489 return NULL;
7490 res = encode_code_page(CP_ACP, unicode, errors);
7491 Py_DECREF(unicode);
7492 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007493}
7494
7495PyObject *
7496PyUnicode_EncodeCodePage(int code_page,
7497 PyObject *unicode,
7498 const char *errors)
7499{
Victor Stinner7581cef2011-11-03 22:32:33 +01007500 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007501}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007502
Alexander Belopolsky40018472011-02-26 01:02:56 +00007503PyObject *
7504PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007505{
7506 if (!PyUnicode_Check(unicode)) {
7507 PyErr_BadArgument();
7508 return NULL;
7509 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007510 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007511}
7512
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007513#undef NEED_RETRY
7514
Victor Stinner99b95382011-07-04 14:23:54 +02007515#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007516
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517/* --- Character Mapping Codec -------------------------------------------- */
7518
Victor Stinnerfb161b12013-04-18 01:44:27 +02007519static int
7520charmap_decode_string(const char *s,
7521 Py_ssize_t size,
7522 PyObject *mapping,
7523 const char *errors,
7524 _PyUnicodeWriter *writer)
7525{
7526 const char *starts = s;
7527 const char *e;
7528 Py_ssize_t startinpos, endinpos;
7529 PyObject *errorHandler = NULL, *exc = NULL;
7530 Py_ssize_t maplen;
7531 enum PyUnicode_Kind mapkind;
7532 void *mapdata;
7533 Py_UCS4 x;
7534 unsigned char ch;
7535
7536 if (PyUnicode_READY(mapping) == -1)
7537 return -1;
7538
7539 maplen = PyUnicode_GET_LENGTH(mapping);
7540 mapdata = PyUnicode_DATA(mapping);
7541 mapkind = PyUnicode_KIND(mapping);
7542
7543 e = s + size;
7544
7545 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7546 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7547 * is disabled in encoding aliases, latin1 is preferred because
7548 * its implementation is faster. */
7549 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7550 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7551 Py_UCS4 maxchar = writer->maxchar;
7552
7553 assert (writer->kind == PyUnicode_1BYTE_KIND);
7554 while (s < e) {
7555 ch = *s;
7556 x = mapdata_ucs1[ch];
7557 if (x > maxchar) {
7558 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7559 goto onError;
7560 maxchar = writer->maxchar;
7561 outdata = (Py_UCS1 *)writer->data;
7562 }
7563 outdata[writer->pos] = x;
7564 writer->pos++;
7565 ++s;
7566 }
7567 return 0;
7568 }
7569
7570 while (s < e) {
7571 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7572 enum PyUnicode_Kind outkind = writer->kind;
7573 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7574 if (outkind == PyUnicode_1BYTE_KIND) {
7575 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7576 Py_UCS4 maxchar = writer->maxchar;
7577 while (s < e) {
7578 ch = *s;
7579 x = mapdata_ucs2[ch];
7580 if (x > maxchar)
7581 goto Error;
7582 outdata[writer->pos] = x;
7583 writer->pos++;
7584 ++s;
7585 }
7586 break;
7587 }
7588 else if (outkind == PyUnicode_2BYTE_KIND) {
7589 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7590 while (s < e) {
7591 ch = *s;
7592 x = mapdata_ucs2[ch];
7593 if (x == 0xFFFE)
7594 goto Error;
7595 outdata[writer->pos] = x;
7596 writer->pos++;
7597 ++s;
7598 }
7599 break;
7600 }
7601 }
7602 ch = *s;
7603
7604 if (ch < maplen)
7605 x = PyUnicode_READ(mapkind, mapdata, ch);
7606 else
7607 x = 0xfffe; /* invalid value */
7608Error:
7609 if (x == 0xfffe)
7610 {
7611 /* undefined mapping */
7612 startinpos = s-starts;
7613 endinpos = startinpos+1;
7614 if (unicode_decode_call_errorhandler_writer(
7615 errors, &errorHandler,
7616 "charmap", "character maps to <undefined>",
7617 &starts, &e, &startinpos, &endinpos, &exc, &s,
7618 writer)) {
7619 goto onError;
7620 }
7621 continue;
7622 }
7623
7624 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7625 goto onError;
7626 ++s;
7627 }
7628 Py_XDECREF(errorHandler);
7629 Py_XDECREF(exc);
7630 return 0;
7631
7632onError:
7633 Py_XDECREF(errorHandler);
7634 Py_XDECREF(exc);
7635 return -1;
7636}
7637
7638static int
7639charmap_decode_mapping(const char *s,
7640 Py_ssize_t size,
7641 PyObject *mapping,
7642 const char *errors,
7643 _PyUnicodeWriter *writer)
7644{
7645 const char *starts = s;
7646 const char *e;
7647 Py_ssize_t startinpos, endinpos;
7648 PyObject *errorHandler = NULL, *exc = NULL;
7649 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007650 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007651
7652 e = s + size;
7653
7654 while (s < e) {
7655 ch = *s;
7656
7657 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7658 key = PyLong_FromLong((long)ch);
7659 if (key == NULL)
7660 goto onError;
7661
7662 item = PyObject_GetItem(mapping, key);
7663 Py_DECREF(key);
7664 if (item == NULL) {
7665 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7666 /* No mapping found means: mapping is undefined. */
7667 PyErr_Clear();
7668 goto Undefined;
7669 } else
7670 goto onError;
7671 }
7672
7673 /* Apply mapping */
7674 if (item == Py_None)
7675 goto Undefined;
7676 if (PyLong_Check(item)) {
7677 long value = PyLong_AS_LONG(item);
7678 if (value == 0xFFFE)
7679 goto Undefined;
7680 if (value < 0 || value > MAX_UNICODE) {
7681 PyErr_Format(PyExc_TypeError,
7682 "character mapping must be in range(0x%lx)",
7683 (unsigned long)MAX_UNICODE + 1);
7684 goto onError;
7685 }
7686
7687 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7688 goto onError;
7689 }
7690 else if (PyUnicode_Check(item)) {
7691 if (PyUnicode_READY(item) == -1)
7692 goto onError;
7693 if (PyUnicode_GET_LENGTH(item) == 1) {
7694 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7695 if (value == 0xFFFE)
7696 goto Undefined;
7697 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7698 goto onError;
7699 }
7700 else {
7701 writer->overallocate = 1;
7702 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7703 goto onError;
7704 }
7705 }
7706 else {
7707 /* wrong return value */
7708 PyErr_SetString(PyExc_TypeError,
7709 "character mapping must return integer, None or str");
7710 goto onError;
7711 }
7712 Py_CLEAR(item);
7713 ++s;
7714 continue;
7715
7716Undefined:
7717 /* undefined mapping */
7718 Py_CLEAR(item);
7719 startinpos = s-starts;
7720 endinpos = startinpos+1;
7721 if (unicode_decode_call_errorhandler_writer(
7722 errors, &errorHandler,
7723 "charmap", "character maps to <undefined>",
7724 &starts, &e, &startinpos, &endinpos, &exc, &s,
7725 writer)) {
7726 goto onError;
7727 }
7728 }
7729 Py_XDECREF(errorHandler);
7730 Py_XDECREF(exc);
7731 return 0;
7732
7733onError:
7734 Py_XDECREF(item);
7735 Py_XDECREF(errorHandler);
7736 Py_XDECREF(exc);
7737 return -1;
7738}
7739
Alexander Belopolsky40018472011-02-26 01:02:56 +00007740PyObject *
7741PyUnicode_DecodeCharmap(const char *s,
7742 Py_ssize_t size,
7743 PyObject *mapping,
7744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007746 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007747
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 /* Default to Latin-1 */
7749 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007753 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007754 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007755 writer.min_length = size;
7756 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007758
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007759 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007760 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7761 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007762 }
7763 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007764 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7765 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007767 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007768
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007770 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 return NULL;
7772}
7773
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774/* Charmap encoding: the lookup table */
7775
Alexander Belopolsky40018472011-02-26 01:02:56 +00007776struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 PyObject_HEAD
7778 unsigned char level1[32];
7779 int count2, count3;
7780 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007781};
7782
7783static PyObject*
7784encoding_map_size(PyObject *obj, PyObject* args)
7785{
7786 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007789}
7790
7791static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 PyDoc_STR("Return the size (in bytes) of this object") },
7794 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007795};
7796
7797static void
7798encoding_map_dealloc(PyObject* o)
7799{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007800 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007801}
7802
7803static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007804 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 "EncodingMap", /*tp_name*/
7806 sizeof(struct encoding_map), /*tp_basicsize*/
7807 0, /*tp_itemsize*/
7808 /* methods */
7809 encoding_map_dealloc, /*tp_dealloc*/
7810 0, /*tp_print*/
7811 0, /*tp_getattr*/
7812 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007813 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 0, /*tp_repr*/
7815 0, /*tp_as_number*/
7816 0, /*tp_as_sequence*/
7817 0, /*tp_as_mapping*/
7818 0, /*tp_hash*/
7819 0, /*tp_call*/
7820 0, /*tp_str*/
7821 0, /*tp_getattro*/
7822 0, /*tp_setattro*/
7823 0, /*tp_as_buffer*/
7824 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7825 0, /*tp_doc*/
7826 0, /*tp_traverse*/
7827 0, /*tp_clear*/
7828 0, /*tp_richcompare*/
7829 0, /*tp_weaklistoffset*/
7830 0, /*tp_iter*/
7831 0, /*tp_iternext*/
7832 encoding_map_methods, /*tp_methods*/
7833 0, /*tp_members*/
7834 0, /*tp_getset*/
7835 0, /*tp_base*/
7836 0, /*tp_dict*/
7837 0, /*tp_descr_get*/
7838 0, /*tp_descr_set*/
7839 0, /*tp_dictoffset*/
7840 0, /*tp_init*/
7841 0, /*tp_alloc*/
7842 0, /*tp_new*/
7843 0, /*tp_free*/
7844 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845};
7846
7847PyObject*
7848PyUnicode_BuildEncodingMap(PyObject* string)
7849{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 PyObject *result;
7851 struct encoding_map *mresult;
7852 int i;
7853 int need_dict = 0;
7854 unsigned char level1[32];
7855 unsigned char level2[512];
7856 unsigned char *mlevel1, *mlevel2, *mlevel3;
7857 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007858 int kind;
7859 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007860 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007863 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007864 PyErr_BadArgument();
7865 return NULL;
7866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007867 kind = PyUnicode_KIND(string);
7868 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007869 length = PyUnicode_GET_LENGTH(string);
7870 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871 memset(level1, 0xFF, sizeof level1);
7872 memset(level2, 0xFF, sizeof level2);
7873
7874 /* If there isn't a one-to-one mapping of NULL to \0,
7875 or if there are non-BMP characters, we need to use
7876 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007879 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 ch = PyUnicode_READ(kind, data, i);
7882 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 need_dict = 1;
7884 break;
7885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007886 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 /* unmapped character */
7888 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 l1 = ch >> 11;
7890 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 if (level1[l1] == 0xFF)
7892 level1[l1] = count2++;
7893 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007894 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895 }
7896
7897 if (count2 >= 0xFF || count3 >= 0xFF)
7898 need_dict = 1;
7899
7900 if (need_dict) {
7901 PyObject *result = PyDict_New();
7902 PyObject *key, *value;
7903 if (!result)
7904 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007905 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007906 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007907 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007908 if (!key || !value)
7909 goto failed1;
7910 if (PyDict_SetItem(result, key, value) == -1)
7911 goto failed1;
7912 Py_DECREF(key);
7913 Py_DECREF(value);
7914 }
7915 return result;
7916 failed1:
7917 Py_XDECREF(key);
7918 Py_XDECREF(value);
7919 Py_DECREF(result);
7920 return NULL;
7921 }
7922
7923 /* Create a three-level trie */
7924 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7925 16*count2 + 128*count3 - 1);
7926 if (!result)
7927 return PyErr_NoMemory();
7928 PyObject_Init(result, &EncodingMapType);
7929 mresult = (struct encoding_map*)result;
7930 mresult->count2 = count2;
7931 mresult->count3 = count3;
7932 mlevel1 = mresult->level1;
7933 mlevel2 = mresult->level23;
7934 mlevel3 = mresult->level23 + 16*count2;
7935 memcpy(mlevel1, level1, 32);
7936 memset(mlevel2, 0xFF, 16*count2);
7937 memset(mlevel3, 0, 128*count3);
7938 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007939 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007940 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007941 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7942 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943 /* unmapped character */
7944 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007945 o1 = ch>>11;
7946 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 i2 = 16*mlevel1[o1] + o2;
7948 if (mlevel2[i2] == 0xFF)
7949 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007950 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007951 i3 = 128*mlevel2[i2] + o3;
7952 mlevel3[i3] = i;
7953 }
7954 return result;
7955}
7956
7957static int
Victor Stinner22168992011-11-20 17:09:18 +01007958encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959{
7960 struct encoding_map *map = (struct encoding_map*)mapping;
7961 int l1 = c>>11;
7962 int l2 = (c>>7) & 0xF;
7963 int l3 = c & 0x7F;
7964 int i;
7965
Victor Stinner22168992011-11-20 17:09:18 +01007966 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 if (c == 0)
7969 return 0;
7970 /* level 1*/
7971 i = map->level1[l1];
7972 if (i == 0xFF) {
7973 return -1;
7974 }
7975 /* level 2*/
7976 i = map->level23[16*i+l2];
7977 if (i == 0xFF) {
7978 return -1;
7979 }
7980 /* level 3 */
7981 i = map->level23[16*map->count2 + 128*i + l3];
7982 if (i == 0) {
7983 return -1;
7984 }
7985 return i;
7986}
7987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988/* Lookup the character ch in the mapping. If the character
7989 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007990 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007991static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007992charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993{
Christian Heimes217cfd12007-12-02 14:31:20 +00007994 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007995 PyObject *x;
7996
7997 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999 x = PyObject_GetItem(mapping, w);
8000 Py_DECREF(w);
8001 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8003 /* No mapping found means: mapping is undefined. */
8004 PyErr_Clear();
8005 x = Py_None;
8006 Py_INCREF(x);
8007 return x;
8008 } else
8009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008011 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008013 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 long value = PyLong_AS_LONG(x);
8015 if (value < 0 || value > 255) {
8016 PyErr_SetString(PyExc_TypeError,
8017 "character mapping must be in range(256)");
8018 Py_DECREF(x);
8019 return NULL;
8020 }
8021 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008023 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 /* wrong return value */
8027 PyErr_Format(PyExc_TypeError,
8028 "character mapping must return integer, bytes or None, not %.400s",
8029 x->ob_type->tp_name);
8030 Py_DECREF(x);
8031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 }
8033}
8034
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008036charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8039 /* exponentially overallocate to minimize reallocations */
8040 if (requiredsize < 2*outsize)
8041 requiredsize = 2*outsize;
8042 if (_PyBytes_Resize(outobj, requiredsize))
8043 return -1;
8044 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045}
8046
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008050/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008051 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052 space is available. Return a new reference to the object that
8053 was put in the output buffer, or Py_None, if the mapping was undefined
8054 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008055 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008056static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008057charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008058 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008059{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060 PyObject *rep;
8061 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008062 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063
Christian Heimes90aa7642007-12-19 02:45:37 +00008064 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 if (res == -1)
8068 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 if (outsize<requiredsize)
8070 if (charmapencode_resize(outobj, outpos, requiredsize))
8071 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008072 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 outstart[(*outpos)++] = (char)res;
8074 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075 }
8076
8077 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 Py_DECREF(rep);
8082 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 if (PyLong_Check(rep)) {
8085 Py_ssize_t requiredsize = *outpos+1;
8086 if (outsize<requiredsize)
8087 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8088 Py_DECREF(rep);
8089 return enc_EXCEPTION;
8090 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008091 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 else {
8095 const char *repchars = PyBytes_AS_STRING(rep);
8096 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8097 Py_ssize_t requiredsize = *outpos+repsize;
8098 if (outsize<requiredsize)
8099 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8100 Py_DECREF(rep);
8101 return enc_EXCEPTION;
8102 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008103 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 memcpy(outstart + *outpos, repchars, repsize);
8105 *outpos += repsize;
8106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008107 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 Py_DECREF(rep);
8109 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110}
8111
8112/* handle an error in PyUnicode_EncodeCharmap
8113 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008114static int
8115charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008116 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008118 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008119 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120{
8121 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008122 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008123 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008124 enum PyUnicode_Kind kind;
8125 void *data;
8126 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 Py_ssize_t collstartpos = *inpos;
8129 Py_ssize_t collendpos = *inpos+1;
8130 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 char *encoding = "charmap";
8132 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008135 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136
Benjamin Petersonbac79492012-01-14 13:34:47 -05008137 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008138 return -1;
8139 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 /* find all unencodable characters */
8141 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008143 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008144 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008145 val = encoding_map_lookup(ch, mapping);
8146 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 break;
8148 ++collendpos;
8149 continue;
8150 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008151
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008152 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8153 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 if (rep==NULL)
8155 return -1;
8156 else if (rep!=Py_None) {
8157 Py_DECREF(rep);
8158 break;
8159 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 }
8163 /* cache callback name lookup
8164 * (if not done yet, i.e. it's the first error) */
8165 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 if ((errors==NULL) || (!strcmp(errors, "strict")))
8167 *known_errorHandler = 1;
8168 else if (!strcmp(errors, "replace"))
8169 *known_errorHandler = 2;
8170 else if (!strcmp(errors, "ignore"))
8171 *known_errorHandler = 3;
8172 else if (!strcmp(errors, "xmlcharrefreplace"))
8173 *known_errorHandler = 4;
8174 else
8175 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008176 }
8177 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008179 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 return -1;
8181 case 2: /* replace */
8182 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 x = charmapencode_output('?', mapping, res, respos);
8184 if (x==enc_EXCEPTION) {
8185 return -1;
8186 }
8187 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008188 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 return -1;
8190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 }
8192 /* fall through */
8193 case 3: /* ignore */
8194 *inpos = collendpos;
8195 break;
8196 case 4: /* xmlcharrefreplace */
8197 /* generate replacement (temporarily (mis)uses p) */
8198 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 char buffer[2+29+1+1];
8200 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008201 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 for (cp = buffer; *cp; ++cp) {
8203 x = charmapencode_output(*cp, mapping, res, respos);
8204 if (x==enc_EXCEPTION)
8205 return -1;
8206 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008207 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return -1;
8209 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 }
8211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 *inpos = collendpos;
8213 break;
8214 default:
8215 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008216 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008220 if (PyBytes_Check(repunicode)) {
8221 /* Directly copy bytes result to output. */
8222 Py_ssize_t outsize = PyBytes_Size(*res);
8223 Py_ssize_t requiredsize;
8224 repsize = PyBytes_Size(repunicode);
8225 requiredsize = *respos + repsize;
8226 if (requiredsize > outsize)
8227 /* Make room for all additional bytes. */
8228 if (charmapencode_resize(res, respos, requiredsize)) {
8229 Py_DECREF(repunicode);
8230 return -1;
8231 }
8232 memcpy(PyBytes_AsString(*res) + *respos,
8233 PyBytes_AsString(repunicode), repsize);
8234 *respos += repsize;
8235 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008236 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008237 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008238 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008239 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008240 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008241 Py_DECREF(repunicode);
8242 return -1;
8243 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008244 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008245 data = PyUnicode_DATA(repunicode);
8246 kind = PyUnicode_KIND(repunicode);
8247 for (index = 0; index < repsize; index++) {
8248 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8249 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008251 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 return -1;
8253 }
8254 else if (x==enc_FAILED) {
8255 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008256 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 return -1;
8258 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008259 }
8260 *inpos = newpos;
8261 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 }
8263 return 0;
8264}
8265
Alexander Belopolsky40018472011-02-26 01:02:56 +00008266PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008267_PyUnicode_EncodeCharmap(PyObject *unicode,
8268 PyObject *mapping,
8269 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 /* output object */
8272 PyObject *res = NULL;
8273 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008274 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008275 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008277 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 PyObject *errorHandler = NULL;
8279 PyObject *exc = NULL;
8280 /* the following variable is used for caching string comparisons
8281 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8282 * 3=ignore, 4=xmlcharrefreplace */
8283 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008284 void *data;
8285 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286
Benjamin Petersonbac79492012-01-14 13:34:47 -05008287 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008288 return NULL;
8289 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008290 data = PyUnicode_DATA(unicode);
8291 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008292
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 /* Default to Latin-1 */
8294 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008295 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 /* allocate enough for a simple encoding without
8298 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008299 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 if (res == NULL)
8301 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008302 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008306 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 if (x==enc_EXCEPTION) /* error */
8310 goto onError;
8311 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008312 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 &exc,
8314 &known_errorHandler, &errorHandler, errors,
8315 &res, &respos)) {
8316 goto onError;
8317 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008318 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 else
8320 /* done with this character => adjust input position */
8321 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008325 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008326 if (_PyBytes_Resize(&res, respos) < 0)
8327 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 Py_XDECREF(exc);
8330 Py_XDECREF(errorHandler);
8331 return res;
8332
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334 Py_XDECREF(res);
8335 Py_XDECREF(exc);
8336 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 return NULL;
8338}
8339
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008340/* Deprecated */
8341PyObject *
8342PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8343 Py_ssize_t size,
8344 PyObject *mapping,
8345 const char *errors)
8346{
8347 PyObject *result;
8348 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8349 if (unicode == NULL)
8350 return NULL;
8351 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8352 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008353 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008354}
8355
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356PyObject *
8357PyUnicode_AsCharmapString(PyObject *unicode,
8358 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359{
8360 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 PyErr_BadArgument();
8362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008364 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365}
8366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368static void
8369make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008371 Py_ssize_t startpos, Py_ssize_t endpos,
8372 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 *exceptionObject = _PyUnicodeTranslateError_Create(
8376 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 }
8378 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8380 goto onError;
8381 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8382 goto onError;
8383 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8384 goto onError;
8385 return;
8386 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008387 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 }
8389}
8390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391/* error handling callback helper:
8392 build arguments, call the callback and check the arguments,
8393 put the result into newpos and return the replacement string, which
8394 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395static PyObject *
8396unicode_translate_call_errorhandler(const char *errors,
8397 PyObject **errorHandler,
8398 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008400 Py_ssize_t startpos, Py_ssize_t endpos,
8401 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008403 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008405 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 PyObject *restuple;
8407 PyObject *resunicode;
8408
8409 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 }
8414
8415 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419
8420 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008425 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 Py_DECREF(restuple);
8427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 }
8429 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 &resunicode, &i_newpos)) {
8431 Py_DECREF(restuple);
8432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008434 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 else
8437 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8440 Py_DECREF(restuple);
8441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 Py_INCREF(resunicode);
8444 Py_DECREF(restuple);
8445 return resunicode;
8446}
8447
8448/* Lookup the character ch in the mapping and put the result in result,
8449 which must be decrefed by the caller.
8450 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008451static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453{
Christian Heimes217cfd12007-12-02 14:31:20 +00008454 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455 PyObject *x;
8456
8457 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 x = PyObject_GetItem(mapping, w);
8460 Py_DECREF(w);
8461 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8463 /* No mapping found means: use 1:1 mapping. */
8464 PyErr_Clear();
8465 *result = NULL;
8466 return 0;
8467 } else
8468 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 }
8470 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 *result = x;
8472 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008474 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 long value = PyLong_AS_LONG(x);
8476 long max = PyUnicode_GetMax();
8477 if (value < 0 || value > max) {
8478 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008479 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 Py_DECREF(x);
8481 return -1;
8482 }
8483 *result = x;
8484 return 0;
8485 }
8486 else if (PyUnicode_Check(x)) {
8487 *result = x;
8488 return 0;
8489 }
8490 else {
8491 /* wrong return value */
8492 PyErr_SetString(PyExc_TypeError,
8493 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 Py_DECREF(x);
8495 return -1;
8496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497}
Victor Stinner1194ea02014-04-04 19:37:40 +02008498
8499/* lookup the character, write the result into the writer.
8500 Return 1 if the result was written into the writer, return 0 if the mapping
8501 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008503charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8504 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505{
Victor Stinner1194ea02014-04-04 19:37:40 +02008506 PyObject *item;
8507
8508 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008510
8511 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008513 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008516 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008518
8519 if (item == Py_None) {
8520 Py_DECREF(item);
8521 return 0;
8522 }
8523
8524 if (PyLong_Check(item)) {
8525 Py_UCS4 ch = (Py_UCS4)PyLong_AS_LONG(item);
8526 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8527 Py_DECREF(item);
8528 return -1;
8529 }
8530 Py_DECREF(item);
8531 return 1;
8532 }
8533
8534 if (!PyUnicode_Check(item)) {
8535 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008537 }
8538
8539 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8540 Py_DECREF(item);
8541 return -1;
8542 }
8543
8544 Py_DECREF(item);
8545 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546}
8547
Victor Stinner89a76ab2014-04-05 11:44:04 +02008548static int
8549unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8550 Py_UCS1 *translate)
8551{
8552 PyObject *item;
8553 int ret = 0;
8554
8555 item = NULL;
8556 if (charmaptranslate_lookup(ch, mapping, &item)) {
8557 return -1;
8558 }
8559
8560 if (item == Py_None) {
8561 /* deletion: skip fast translate */
8562 goto exit;
8563 }
8564
8565 if (item == NULL) {
8566 /* not found => default to 1:1 mapping */
8567 translate[ch] = ch;
8568 return 1;
8569 }
8570
8571 if (PyLong_Check(item)) {
8572 long replace = (Py_UCS4)PyLong_AS_LONG(item);
8573 if (replace == -1) {
8574 Py_DECREF(item);
8575 return -1;
8576 }
8577 if (replace < 0 || 127 < replace) {
8578 /* invalid character or character outside ASCII:
8579 skip the fast translate */
8580 goto exit;
8581 }
8582 translate[ch] = (Py_UCS1)replace;
8583 }
8584 else if (PyUnicode_Check(item)) {
8585 Py_UCS4 replace;
8586
8587 if (PyUnicode_READY(item) == -1) {
8588 Py_DECREF(item);
8589 return -1;
8590 }
8591 if (PyUnicode_GET_LENGTH(item) != 1)
8592 goto exit;
8593
8594 replace = PyUnicode_READ_CHAR(item, 0);
8595 if (replace > 127)
8596 goto exit;
8597 translate[ch] = (Py_UCS1)replace;
8598 }
8599 else {
8600 /* not a long or unicode */
8601 goto exit;
8602 }
8603 Py_DECREF(item);
8604 item = NULL;
8605 ret = 1;
8606
8607exit:
8608 Py_XDECREF(item);
8609 return ret;
8610}
8611
8612/* Fast path for ascii => ascii translation. Return 1 if the whole string
8613 was translated into writer, return 0 if the input string was partially
8614 translated into writer, raise an exception and return -1 on error. */
8615static int
8616unicode_fast_translate(PyObject *input, PyObject *mapping,
8617 _PyUnicodeWriter *writer)
8618{
8619 Py_UCS1 translate[128], ch, ch2;
8620 Py_ssize_t len;
8621 Py_UCS1 *in, *end, *out;
8622 int res;
8623
8624 if (PyUnicode_READY(input) == -1)
8625 return -1;
8626 if (!PyUnicode_IS_ASCII(input))
8627 return 0;
8628 len = PyUnicode_GET_LENGTH(input);
8629
8630 memset(translate, 0xff, 128);
8631
8632 in = PyUnicode_1BYTE_DATA(input);
8633 end = in + len;
8634
8635 assert(PyUnicode_IS_ASCII(writer->buffer));
8636 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8637 out = PyUnicode_1BYTE_DATA(writer->buffer);
8638
8639 for (; in < end; in++, out++) {
8640 ch = *in;
8641 ch2 = translate[ch];
8642 if (ch2 == 0xff) {
8643 res = unicode_fast_translate_lookup(mapping, ch, translate);
8644 if (res < 0)
8645 return -1;
8646 if (res == 0) {
8647 writer->pos = in - PyUnicode_1BYTE_DATA(input);
8648 return 0;
8649 }
8650 ch2 = translate[ch];
8651 }
8652 *out = ch2;
8653 }
8654 writer->pos = len;
8655 return 1;
8656}
8657
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659_PyUnicode_TranslateCharmap(PyObject *input,
8660 PyObject *mapping,
8661 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008664 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 Py_ssize_t size, i;
8666 int kind;
8667 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008668 _PyUnicodeWriter writer;
8669 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 char *reason = "character maps to <undefined>";
8671 PyObject *errorHandler = NULL;
8672 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008673 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008674 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 PyErr_BadArgument();
8678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 if (PyUnicode_READY(input) == -1)
8682 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008683 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 kind = PyUnicode_KIND(input);
8685 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686
8687 if (size == 0) {
8688 Py_INCREF(input);
8689 return input;
8690 }
8691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 /* allocate enough for a simple 1:1 translation without
8693 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008694 _PyUnicodeWriter_Init(&writer);
8695 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697
Victor Stinner89a76ab2014-04-05 11:44:04 +02008698 res = unicode_fast_translate(input, mapping, &writer);
8699 if (res < 0) {
8700 _PyUnicodeWriter_Dealloc(&writer);
8701 return NULL;
8702 }
8703 if (res == 1)
8704 return _PyUnicodeWriter_Finish(&writer);
8705
Victor Stinner1194ea02014-04-04 19:37:40 +02008706 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8707
Victor Stinner89a76ab2014-04-05 11:44:04 +02008708 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008711 int translate;
8712 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8713 Py_ssize_t newpos;
8714 /* startpos for collecting untranslatable chars */
8715 Py_ssize_t collstart;
8716 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008717 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718
Victor Stinner1194ea02014-04-04 19:37:40 +02008719 ch = PyUnicode_READ(kind, data, i);
8720 translate = charmaptranslate_output(ch, mapping, &writer);
8721 if (translate < 0)
8722 goto onError;
8723
8724 if (translate != 0) {
8725 /* it worked => adjust input pointer */
8726 ++i;
8727 continue;
8728 }
8729
8730 /* untranslatable character */
8731 collstart = i;
8732 collend = i+1;
8733
8734 /* find all untranslatable characters */
8735 while (collend < size) {
8736 PyObject *x;
8737 ch = PyUnicode_READ(kind, data, collend);
8738 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008739 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008740 Py_XDECREF(x);
8741 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008743 ++collend;
8744 }
8745
8746 if (ignore) {
8747 i = collend;
8748 }
8749 else {
8750 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8751 reason, input, &exc,
8752 collstart, collend, &newpos);
8753 if (repunicode == NULL)
8754 goto onError;
8755 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008757 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008758 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008759 Py_DECREF(repunicode);
8760 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008761 }
8762 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763 Py_XDECREF(exc);
8764 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008765 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008768 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769 Py_XDECREF(exc);
8770 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 return NULL;
8772}
8773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774/* Deprecated. Use PyUnicode_Translate instead. */
8775PyObject *
8776PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8777 Py_ssize_t size,
8778 PyObject *mapping,
8779 const char *errors)
8780{
Christian Heimes5f520f42012-09-11 14:03:25 +02008781 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8783 if (!unicode)
8784 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008785 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8786 Py_DECREF(unicode);
8787 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788}
8789
Alexander Belopolsky40018472011-02-26 01:02:56 +00008790PyObject *
8791PyUnicode_Translate(PyObject *str,
8792 PyObject *mapping,
8793 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794{
8795 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008796
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 str = PyUnicode_FromObject(str);
8798 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008799 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 Py_DECREF(str);
8802 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803}
Tim Petersced69f82003-09-16 20:30:58 +00008804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008806fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008807{
8808 /* No need to call PyUnicode_READY(self) because this function is only
8809 called as a callback from fixup() which does it already. */
8810 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8811 const int kind = PyUnicode_KIND(self);
8812 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008813 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008814 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 Py_ssize_t i;
8816
8817 for (i = 0; i < len; ++i) {
8818 ch = PyUnicode_READ(kind, data, i);
8819 fixed = 0;
8820 if (ch > 127) {
8821 if (Py_UNICODE_ISSPACE(ch))
8822 fixed = ' ';
8823 else {
8824 const int decimal = Py_UNICODE_TODECIMAL(ch);
8825 if (decimal >= 0)
8826 fixed = '0' + decimal;
8827 }
8828 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008829 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008830 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 PyUnicode_WRITE(kind, data, i, fixed);
8832 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008833 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008834 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 }
8837
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008838 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839}
8840
8841PyObject *
8842_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8843{
8844 if (!PyUnicode_Check(unicode)) {
8845 PyErr_BadInternalCall();
8846 return NULL;
8847 }
8848 if (PyUnicode_READY(unicode) == -1)
8849 return NULL;
8850 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8851 /* If the string is already ASCII, just return the same string */
8852 Py_INCREF(unicode);
8853 return unicode;
8854 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008855 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856}
8857
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008858PyObject *
8859PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8860 Py_ssize_t length)
8861{
Victor Stinnerf0124502011-11-21 23:12:56 +01008862 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008863 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008864 Py_UCS4 maxchar;
8865 enum PyUnicode_Kind kind;
8866 void *data;
8867
Victor Stinner99d7ad02012-02-22 13:37:39 +01008868 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008869 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008870 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008871 if (ch > 127) {
8872 int decimal = Py_UNICODE_TODECIMAL(ch);
8873 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008874 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008875 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008876 }
8877 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008878
8879 /* Copy to a new string */
8880 decimal = PyUnicode_New(length, maxchar);
8881 if (decimal == NULL)
8882 return decimal;
8883 kind = PyUnicode_KIND(decimal);
8884 data = PyUnicode_DATA(decimal);
8885 /* Iterate over code points */
8886 for (i = 0; i < length; i++) {
8887 Py_UNICODE ch = s[i];
8888 if (ch > 127) {
8889 int decimal = Py_UNICODE_TODECIMAL(ch);
8890 if (decimal >= 0)
8891 ch = '0' + decimal;
8892 }
8893 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008895 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008896}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008897/* --- Decimal Encoder ---------------------------------------------------- */
8898
Alexander Belopolsky40018472011-02-26 01:02:56 +00008899int
8900PyUnicode_EncodeDecimal(Py_UNICODE *s,
8901 Py_ssize_t length,
8902 char *output,
8903 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008904{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008905 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008906 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008907 enum PyUnicode_Kind kind;
8908 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008909
8910 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 PyErr_BadArgument();
8912 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008913 }
8914
Victor Stinner42bf7752011-11-21 22:52:58 +01008915 unicode = PyUnicode_FromUnicode(s, length);
8916 if (unicode == NULL)
8917 return -1;
8918
Benjamin Petersonbac79492012-01-14 13:34:47 -05008919 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008920 Py_DECREF(unicode);
8921 return -1;
8922 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008923 kind = PyUnicode_KIND(unicode);
8924 data = PyUnicode_DATA(unicode);
8925
Victor Stinnerb84d7232011-11-22 01:50:07 +01008926 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008927 PyObject *exc;
8928 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008930 Py_ssize_t startpos;
8931
8932 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008933
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008935 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008936 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008938 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 decimal = Py_UNICODE_TODECIMAL(ch);
8940 if (decimal >= 0) {
8941 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008942 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 continue;
8944 }
8945 if (0 < ch && ch < 256) {
8946 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008947 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 continue;
8949 }
Victor Stinner6345be92011-11-25 20:09:01 +01008950
Victor Stinner42bf7752011-11-21 22:52:58 +01008951 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008952 exc = NULL;
8953 raise_encode_exception(&exc, "decimal", unicode,
8954 startpos, startpos+1,
8955 "invalid decimal Unicode string");
8956 Py_XDECREF(exc);
8957 Py_DECREF(unicode);
8958 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008959 }
8960 /* 0-terminate the output string */
8961 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008962 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008963 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008964}
8965
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966/* --- Helpers ------------------------------------------------------------ */
8967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008969any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 Py_ssize_t start,
8971 Py_ssize_t end)
8972{
8973 int kind1, kind2, kind;
8974 void *buf1, *buf2;
8975 Py_ssize_t len1, len2, result;
8976
8977 kind1 = PyUnicode_KIND(s1);
8978 kind2 = PyUnicode_KIND(s2);
8979 kind = kind1 > kind2 ? kind1 : kind2;
8980 buf1 = PyUnicode_DATA(s1);
8981 buf2 = PyUnicode_DATA(s2);
8982 if (kind1 != kind)
8983 buf1 = _PyUnicode_AsKind(s1, kind);
8984 if (!buf1)
8985 return -2;
8986 if (kind2 != kind)
8987 buf2 = _PyUnicode_AsKind(s2, kind);
8988 if (!buf2) {
8989 if (kind1 != kind) PyMem_Free(buf1);
8990 return -2;
8991 }
8992 len1 = PyUnicode_GET_LENGTH(s1);
8993 len2 = PyUnicode_GET_LENGTH(s2);
8994
Victor Stinner794d5672011-10-10 03:21:36 +02008995 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008996 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008997 case PyUnicode_1BYTE_KIND:
8998 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8999 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9000 else
9001 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9002 break;
9003 case PyUnicode_2BYTE_KIND:
9004 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9005 break;
9006 case PyUnicode_4BYTE_KIND:
9007 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9008 break;
9009 default:
9010 assert(0); result = -2;
9011 }
9012 }
9013 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009014 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009015 case PyUnicode_1BYTE_KIND:
9016 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9017 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9018 else
9019 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9020 break;
9021 case PyUnicode_2BYTE_KIND:
9022 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9023 break;
9024 case PyUnicode_4BYTE_KIND:
9025 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9026 break;
9027 default:
9028 assert(0); result = -2;
9029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 }
9031
9032 if (kind1 != kind)
9033 PyMem_Free(buf1);
9034 if (kind2 != kind)
9035 PyMem_Free(buf2);
9036
9037 return result;
9038}
9039
9040Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009041_PyUnicode_InsertThousandsGrouping(
9042 PyObject *unicode, Py_ssize_t index,
9043 Py_ssize_t n_buffer,
9044 void *digits, Py_ssize_t n_digits,
9045 Py_ssize_t min_width,
9046 const char *grouping, PyObject *thousands_sep,
9047 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048{
Victor Stinner41a863c2012-02-24 00:37:51 +01009049 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009050 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009051 Py_ssize_t thousands_sep_len;
9052 Py_ssize_t len;
9053
9054 if (unicode != NULL) {
9055 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009056 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009057 }
9058 else {
9059 kind = PyUnicode_1BYTE_KIND;
9060 data = NULL;
9061 }
9062 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9063 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9064 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9065 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009066 if (thousands_sep_kind < kind) {
9067 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9068 if (!thousands_sep_data)
9069 return -1;
9070 }
9071 else {
9072 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9073 if (!data)
9074 return -1;
9075 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009076 }
9077
Benjamin Petersonead6b532011-12-20 17:23:42 -06009078 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009080 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009081 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009082 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009083 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009084 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009085 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009086 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009087 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009088 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009089 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009090 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009092 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009093 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009094 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009095 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009096 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009098 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009099 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009100 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009101 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009102 break;
9103 default:
9104 assert(0);
9105 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009107 if (unicode != NULL && thousands_sep_kind != kind) {
9108 if (thousands_sep_kind < kind)
9109 PyMem_Free(thousands_sep_data);
9110 else
9111 PyMem_Free(data);
9112 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009113 if (unicode == NULL) {
9114 *maxchar = 127;
9115 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009116 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009117 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009118 }
9119 }
9120 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121}
9122
9123
/* Helper macro to fix up start/end slice values in place.

   end is capped at len; a negative end is offset by len and floored at 0.
   A negative start is offset by len and floored at 0 (start is not capped
   at len).

   start and end must be modifiable lvalues, and every argument may be
   evaluated more than once.  The expansion is a single statement, so the
   macro is safe as the body of an unbraced if/else; write it with a
   trailing semicolon: ADJUST_INDICES(start, end, len); */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009138
Alexander Belopolsky40018472011-02-26 01:02:56 +00009139Py_ssize_t
9140PyUnicode_Count(PyObject *str,
9141 PyObject *substr,
9142 Py_ssize_t start,
9143 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009145 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009146 PyObject* str_obj;
9147 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 int kind1, kind2, kind;
9149 void *buf1 = NULL, *buf2 = NULL;
9150 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009151
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009152 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009153 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009155 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009156 if (!sub_obj) {
9157 Py_DECREF(str_obj);
9158 return -1;
9159 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009160 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009161 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009162 Py_DECREF(str_obj);
9163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164 }
Tim Petersced69f82003-09-16 20:30:58 +00009165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 kind1 = PyUnicode_KIND(str_obj);
9167 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009168 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009171 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009172 if (kind2 > kind) {
9173 Py_DECREF(sub_obj);
9174 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009175 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009176 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009177 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 if (!buf2)
9180 goto onError;
9181 len1 = PyUnicode_GET_LENGTH(str_obj);
9182 len2 = PyUnicode_GET_LENGTH(sub_obj);
9183
9184 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009185 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009187 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9188 result = asciilib_count(
9189 ((Py_UCS1*)buf1) + start, end - start,
9190 buf2, len2, PY_SSIZE_T_MAX
9191 );
9192 else
9193 result = ucs1lib_count(
9194 ((Py_UCS1*)buf1) + start, end - start,
9195 buf2, len2, PY_SSIZE_T_MAX
9196 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 break;
9198 case PyUnicode_2BYTE_KIND:
9199 result = ucs2lib_count(
9200 ((Py_UCS2*)buf1) + start, end - start,
9201 buf2, len2, PY_SSIZE_T_MAX
9202 );
9203 break;
9204 case PyUnicode_4BYTE_KIND:
9205 result = ucs4lib_count(
9206 ((Py_UCS4*)buf1) + start, end - start,
9207 buf2, len2, PY_SSIZE_T_MAX
9208 );
9209 break;
9210 default:
9211 assert(0); result = 0;
9212 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009213
9214 Py_DECREF(sub_obj);
9215 Py_DECREF(str_obj);
9216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 if (kind2 != kind)
9218 PyMem_Free(buf2);
9219
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 onError:
9222 Py_DECREF(sub_obj);
9223 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 if (kind2 != kind && buf2)
9225 PyMem_Free(buf2);
9226 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227}
9228
Alexander Belopolsky40018472011-02-26 01:02:56 +00009229Py_ssize_t
9230PyUnicode_Find(PyObject *str,
9231 PyObject *sub,
9232 Py_ssize_t start,
9233 Py_ssize_t end,
9234 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009236 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009237
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009239 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009240 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009241 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009242 if (!sub) {
9243 Py_DECREF(str);
9244 return -2;
9245 }
9246 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9247 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009248 Py_DECREF(str);
9249 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250 }
Tim Petersced69f82003-09-16 20:30:58 +00009251
Victor Stinner794d5672011-10-10 03:21:36 +02009252 result = any_find_slice(direction,
9253 str, sub, start, end
9254 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009255
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009257 Py_DECREF(sub);
9258
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259 return result;
9260}
9261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262Py_ssize_t
9263PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9264 Py_ssize_t start, Py_ssize_t end,
9265 int direction)
9266{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009268 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 if (PyUnicode_READY(str) == -1)
9270 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009271 if (start < 0 || end < 0) {
9272 PyErr_SetString(PyExc_IndexError, "string index out of range");
9273 return -2;
9274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 if (end > PyUnicode_GET_LENGTH(str))
9276 end = PyUnicode_GET_LENGTH(str);
9277 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009278 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9279 kind, end-start, ch, direction);
9280 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009282 else
9283 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284}
9285
Alexander Belopolsky40018472011-02-26 01:02:56 +00009286static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009287tailmatch(PyObject *self,
9288 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009289 Py_ssize_t start,
9290 Py_ssize_t end,
9291 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 int kind_self;
9294 int kind_sub;
9295 void *data_self;
9296 void *data_sub;
9297 Py_ssize_t offset;
9298 Py_ssize_t i;
9299 Py_ssize_t end_sub;
9300
9301 if (PyUnicode_READY(self) == -1 ||
9302 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009303 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304
9305 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 return 1;
9307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9309 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009311 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 kind_self = PyUnicode_KIND(self);
9314 data_self = PyUnicode_DATA(self);
9315 kind_sub = PyUnicode_KIND(substring);
9316 data_sub = PyUnicode_DATA(substring);
9317 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9318
9319 if (direction > 0)
9320 offset = end;
9321 else
9322 offset = start;
9323
9324 if (PyUnicode_READ(kind_self, data_self, offset) ==
9325 PyUnicode_READ(kind_sub, data_sub, 0) &&
9326 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9327 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9328 /* If both are of the same kind, memcmp is sufficient */
9329 if (kind_self == kind_sub) {
9330 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009331 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 data_sub,
9333 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009334 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335 }
9336 /* otherwise we have to compare each character by first accesing it */
9337 else {
9338 /* We do not need to compare 0 and len(substring)-1 because
9339 the if statement above ensured already that they are equal
9340 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 for (i = 1; i < end_sub; ++i) {
9342 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9343 PyUnicode_READ(kind_sub, data_sub, i))
9344 return 0;
9345 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 }
9349
9350 return 0;
9351}
9352
Alexander Belopolsky40018472011-02-26 01:02:56 +00009353Py_ssize_t
9354PyUnicode_Tailmatch(PyObject *str,
9355 PyObject *substr,
9356 Py_ssize_t start,
9357 Py_ssize_t end,
9358 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009360 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 str = PyUnicode_FromObject(str);
9363 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 substr = PyUnicode_FromObject(substr);
9366 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 Py_DECREF(str);
9368 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
Tim Petersced69f82003-09-16 20:30:58 +00009370
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009371 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 Py_DECREF(str);
9374 Py_DECREF(substr);
9375 return result;
9376}
9377
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378/* Apply fixfct filter to the Unicode object self and return a
9379 reference to the modified object */
9380
Alexander Belopolsky40018472011-02-26 01:02:56 +00009381static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009382fixup(PyObject *self,
9383 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 PyObject *u;
9386 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009387 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009389 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009392 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 /* fix functions return the new maximum character in a string,
9395 if the kind of the resulting unicode object does not change,
9396 everything is fine. Otherwise we need to change the string kind
9397 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009398 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009399
9400 if (maxchar_new == 0) {
9401 /* no changes */;
9402 if (PyUnicode_CheckExact(self)) {
9403 Py_DECREF(u);
9404 Py_INCREF(self);
9405 return self;
9406 }
9407 else
9408 return u;
9409 }
9410
Victor Stinnere6abb482012-05-02 01:15:40 +02009411 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412
Victor Stinnereaab6042011-12-11 22:22:39 +01009413 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009415
9416 /* In case the maximum character changed, we need to
9417 convert the string to the new category. */
9418 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9419 if (v == NULL) {
9420 Py_DECREF(u);
9421 return NULL;
9422 }
9423 if (maxchar_new > maxchar_old) {
9424 /* If the maxchar increased so that the kind changed, not all
9425 characters are representable anymore and we need to fix the
9426 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009427 _PyUnicode_FastCopyCharacters(v, 0,
9428 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009429 maxchar_old = fixfct(v);
9430 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 }
9432 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009433 _PyUnicode_FastCopyCharacters(v, 0,
9434 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009436 Py_DECREF(u);
9437 assert(_PyUnicode_CheckConsistency(v, 1));
9438 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439}
9440
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009441static PyObject *
9442ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009444 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9445 char *resdata, *data = PyUnicode_DATA(self);
9446 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009447
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009448 res = PyUnicode_New(len, 127);
9449 if (res == NULL)
9450 return NULL;
9451 resdata = PyUnicode_DATA(res);
9452 if (lower)
9453 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009455 _Py_bytes_upper(resdata, data, len);
9456 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457}
9458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009460handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009462 Py_ssize_t j;
9463 int final_sigma;
9464 Py_UCS4 c;
9465 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009466
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009467 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9468
9469 where ! is a negation and \p{xxx} is a character with property xxx.
9470 */
9471 for (j = i - 1; j >= 0; j--) {
9472 c = PyUnicode_READ(kind, data, j);
9473 if (!_PyUnicode_IsCaseIgnorable(c))
9474 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009476 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9477 if (final_sigma) {
9478 for (j = i + 1; j < length; j++) {
9479 c = PyUnicode_READ(kind, data, j);
9480 if (!_PyUnicode_IsCaseIgnorable(c))
9481 break;
9482 }
9483 final_sigma = j == length || !_PyUnicode_IsCased(c);
9484 }
9485 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486}
9487
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009488static int
9489lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9490 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009492 /* Obscure special case. */
9493 if (c == 0x3A3) {
9494 mapped[0] = handle_capital_sigma(kind, data, length, i);
9495 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009497 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498}
9499
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009500static Py_ssize_t
9501do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009503 Py_ssize_t i, k = 0;
9504 int n_res, j;
9505 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009506
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009507 c = PyUnicode_READ(kind, data, 0);
9508 n_res = _PyUnicode_ToUpperFull(c, mapped);
9509 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009510 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009511 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009513 for (i = 1; i < length; i++) {
9514 c = PyUnicode_READ(kind, data, i);
9515 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9516 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009517 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009518 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009519 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009520 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009521 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522}
9523
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009524static Py_ssize_t
9525do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9526 Py_ssize_t i, k = 0;
9527
9528 for (i = 0; i < length; i++) {
9529 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9530 int n_res, j;
9531 if (Py_UNICODE_ISUPPER(c)) {
9532 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9533 }
9534 else if (Py_UNICODE_ISLOWER(c)) {
9535 n_res = _PyUnicode_ToUpperFull(c, mapped);
9536 }
9537 else {
9538 n_res = 1;
9539 mapped[0] = c;
9540 }
9541 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009542 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009543 res[k++] = mapped[j];
9544 }
9545 }
9546 return k;
9547}
9548
9549static Py_ssize_t
9550do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9551 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009553 Py_ssize_t i, k = 0;
9554
9555 for (i = 0; i < length; i++) {
9556 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9557 int n_res, j;
9558 if (lower)
9559 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9560 else
9561 n_res = _PyUnicode_ToUpperFull(c, mapped);
9562 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009563 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009564 res[k++] = mapped[j];
9565 }
9566 }
9567 return k;
9568}
9569
9570static Py_ssize_t
9571do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9572{
9573 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9574}
9575
9576static Py_ssize_t
9577do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9578{
9579 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9580}
9581
Benjamin Petersone51757f2012-01-12 21:10:29 -05009582static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009583do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9584{
9585 Py_ssize_t i, k = 0;
9586
9587 for (i = 0; i < length; i++) {
9588 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9589 Py_UCS4 mapped[3];
9590 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9591 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009592 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009593 res[k++] = mapped[j];
9594 }
9595 }
9596 return k;
9597}
9598
9599static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009600do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9601{
9602 Py_ssize_t i, k = 0;
9603 int previous_is_cased;
9604
9605 previous_is_cased = 0;
9606 for (i = 0; i < length; i++) {
9607 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9608 Py_UCS4 mapped[3];
9609 int n_res, j;
9610
9611 if (previous_is_cased)
9612 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9613 else
9614 n_res = _PyUnicode_ToTitleFull(c, mapped);
9615
9616 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009617 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009618 res[k++] = mapped[j];
9619 }
9620
9621 previous_is_cased = _PyUnicode_IsCased(c);
9622 }
9623 return k;
9624}
9625
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009626static PyObject *
9627case_operation(PyObject *self,
9628 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9629{
9630 PyObject *res = NULL;
9631 Py_ssize_t length, newlength = 0;
9632 int kind, outkind;
9633 void *data, *outdata;
9634 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9635
Benjamin Petersoneea48462012-01-16 14:28:50 -05009636 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637
9638 kind = PyUnicode_KIND(self);
9639 data = PyUnicode_DATA(self);
9640 length = PyUnicode_GET_LENGTH(self);
9641 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9642 if (tmp == NULL)
9643 return PyErr_NoMemory();
9644 newlength = perform(kind, data, length, tmp, &maxchar);
9645 res = PyUnicode_New(newlength, maxchar);
9646 if (res == NULL)
9647 goto leave;
9648 tmpend = tmp + newlength;
9649 outdata = PyUnicode_DATA(res);
9650 outkind = PyUnicode_KIND(res);
9651 switch (outkind) {
9652 case PyUnicode_1BYTE_KIND:
9653 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9654 break;
9655 case PyUnicode_2BYTE_KIND:
9656 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9657 break;
9658 case PyUnicode_4BYTE_KIND:
9659 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9660 break;
9661 default:
9662 assert(0);
9663 break;
9664 }
9665 leave:
9666 PyMem_FREE(tmp);
9667 return res;
9668}
9669
Tim Peters8ce9f162004-08-27 01:49:32 +00009670PyObject *
9671PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009674 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009676 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009677 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9678 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009679 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009681 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009683 int use_memcpy;
9684 unsigned char *res_data = NULL, *sep_data = NULL;
9685 PyObject *last_obj;
9686 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009688 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009689 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009690 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009691 }
9692
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009693 /* NOTE: the following code can't call back into Python code,
9694 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009695 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009696
Tim Peters05eba1f2004-08-27 21:32:02 +00009697 seqlen = PySequence_Fast_GET_SIZE(fseq);
9698 /* If empty sequence, return u"". */
9699 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009700 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009701 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009702 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009703
Tim Peters05eba1f2004-08-27 21:32:02 +00009704 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009705 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009706 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009707 if (seqlen == 1) {
9708 if (PyUnicode_CheckExact(items[0])) {
9709 res = items[0];
9710 Py_INCREF(res);
9711 Py_DECREF(fseq);
9712 return res;
9713 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009714 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009715 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009716 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009717 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009718 /* Set up sep and seplen */
9719 if (separator == NULL) {
9720 /* fall back to a blank space separator */
9721 sep = PyUnicode_FromOrdinal(' ');
9722 if (!sep)
9723 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009724 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009725 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009726 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009727 else {
9728 if (!PyUnicode_Check(separator)) {
9729 PyErr_Format(PyExc_TypeError,
9730 "separator: expected str instance,"
9731 " %.80s found",
9732 Py_TYPE(separator)->tp_name);
9733 goto onError;
9734 }
9735 if (PyUnicode_READY(separator))
9736 goto onError;
9737 sep = separator;
9738 seplen = PyUnicode_GET_LENGTH(separator);
9739 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9740 /* inc refcount to keep this code path symmetric with the
9741 above case of a blank separator */
9742 Py_INCREF(sep);
9743 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009744 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009745 }
9746
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009747 /* There are at least two things to join, or else we have a subclass
9748 * of str in the sequence.
9749 * Do a pre-pass to figure out the total amount of space we'll
9750 * need (sz), and see whether all argument are strings.
9751 */
9752 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009753#ifdef Py_DEBUG
9754 use_memcpy = 0;
9755#else
9756 use_memcpy = 1;
9757#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009758 for (i = 0; i < seqlen; i++) {
9759 const Py_ssize_t old_sz = sz;
9760 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 if (!PyUnicode_Check(item)) {
9762 PyErr_Format(PyExc_TypeError,
9763 "sequence item %zd: expected str instance,"
9764 " %.80s found",
9765 i, Py_TYPE(item)->tp_name);
9766 goto onError;
9767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 if (PyUnicode_READY(item) == -1)
9769 goto onError;
9770 sz += PyUnicode_GET_LENGTH(item);
9771 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009772 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009773 if (i != 0)
9774 sz += seplen;
9775 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9776 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009777 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009778 goto onError;
9779 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009780 if (use_memcpy && last_obj != NULL) {
9781 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9782 use_memcpy = 0;
9783 }
9784 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009785 }
Tim Petersced69f82003-09-16 20:30:58 +00009786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009788 if (res == NULL)
9789 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009790
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009791 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009792#ifdef Py_DEBUG
9793 use_memcpy = 0;
9794#else
9795 if (use_memcpy) {
9796 res_data = PyUnicode_1BYTE_DATA(res);
9797 kind = PyUnicode_KIND(res);
9798 if (seplen != 0)
9799 sep_data = PyUnicode_1BYTE_DATA(sep);
9800 }
9801#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009802 if (use_memcpy) {
9803 for (i = 0; i < seqlen; ++i) {
9804 Py_ssize_t itemlen;
9805 item = items[i];
9806
9807 /* Copy item, and maybe the separator. */
9808 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009809 Py_MEMCPY(res_data,
9810 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009811 kind * seplen);
9812 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009813 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009814
9815 itemlen = PyUnicode_GET_LENGTH(item);
9816 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009817 Py_MEMCPY(res_data,
9818 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009819 kind * itemlen);
9820 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009821 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009822 }
9823 assert(res_data == PyUnicode_1BYTE_DATA(res)
9824 + kind * PyUnicode_GET_LENGTH(res));
9825 }
9826 else {
9827 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9828 Py_ssize_t itemlen;
9829 item = items[i];
9830
9831 /* Copy item, and maybe the separator. */
9832 if (i && seplen != 0) {
9833 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9834 res_offset += seplen;
9835 }
9836
9837 itemlen = PyUnicode_GET_LENGTH(item);
9838 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009839 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009840 res_offset += itemlen;
9841 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009842 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009843 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009844 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009845
Tim Peters05eba1f2004-08-27 21:32:02 +00009846 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009848 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850
Benjamin Peterson29060642009-01-31 22:14:21 +00009851 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009852 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009854 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855 return NULL;
9856}
9857
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width (1, 2 or 4 byte storage); the wchar kind is rejected
   by an assertion.

   Every argument is parenthesized in the expansion, so arbitrary
   expressions are safe to pass.  Note that `value` and `length` are
   evaluated once per iteration in the 2 and 4 byte cases, so they must
   be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9881
Victor Stinnerd3f08822012-05-29 12:57:52 +02009882void
9883_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9884 Py_UCS4 fill_char)
9885{
9886 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9887 const void *data = PyUnicode_DATA(unicode);
9888 assert(PyUnicode_IS_READY(unicode));
9889 assert(unicode_modifiable(unicode));
9890 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9891 assert(start >= 0);
9892 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9893 FILL(kind, data, fill_char, start, length);
9894}
9895
Victor Stinner3fe55312012-01-04 00:33:50 +01009896Py_ssize_t
9897PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9898 Py_UCS4 fill_char)
9899{
9900 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009901
9902 if (!PyUnicode_Check(unicode)) {
9903 PyErr_BadInternalCall();
9904 return -1;
9905 }
9906 if (PyUnicode_READY(unicode) == -1)
9907 return -1;
9908 if (unicode_check_modifiable(unicode))
9909 return -1;
9910
Victor Stinnerd3f08822012-05-29 12:57:52 +02009911 if (start < 0) {
9912 PyErr_SetString(PyExc_IndexError, "string index out of range");
9913 return -1;
9914 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009915 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9916 PyErr_SetString(PyExc_ValueError,
9917 "fill character is bigger than "
9918 "the string maximum character");
9919 return -1;
9920 }
9921
9922 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9923 length = Py_MIN(maxlen, length);
9924 if (length <= 0)
9925 return 0;
9926
Victor Stinnerd3f08822012-05-29 12:57:52 +02009927 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009928 return length;
9929}
9930
Victor Stinner9310abb2011-10-05 00:59:23 +02009931static PyObject *
9932pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009933 Py_ssize_t left,
9934 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 PyObject *u;
9938 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009939 int kind;
9940 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941
9942 if (left < 0)
9943 left = 0;
9944 if (right < 0)
9945 right = 0;
9946
Victor Stinnerc4b49542011-12-11 22:44:26 +01009947 if (left == 0 && right == 0)
9948 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9951 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009952 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9953 return NULL;
9954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009956 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009958 if (!u)
9959 return NULL;
9960
9961 kind = PyUnicode_KIND(u);
9962 data = PyUnicode_DATA(u);
9963 if (left)
9964 FILL(kind, data, fill, 0, left);
9965 if (right)
9966 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009967 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009968 assert(_PyUnicode_CheckConsistency(u, 1));
9969 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970}
9971
Alexander Belopolsky40018472011-02-26 01:02:56 +00009972PyObject *
9973PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976
9977 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009978 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009979 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009980 if (PyUnicode_READY(string) == -1) {
9981 Py_DECREF(string);
9982 return NULL;
9983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984
Benjamin Petersonead6b532011-12-20 17:23:42 -06009985 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009987 if (PyUnicode_IS_ASCII(string))
9988 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009989 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009990 PyUnicode_GET_LENGTH(string), keepends);
9991 else
9992 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009993 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 break;
9996 case PyUnicode_2BYTE_KIND:
9997 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009998 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 PyUnicode_GET_LENGTH(string), keepends);
10000 break;
10001 case PyUnicode_4BYTE_KIND:
10002 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010003 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 PyUnicode_GET_LENGTH(string), keepends);
10005 break;
10006 default:
10007 assert(0);
10008 list = 0;
10009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010 Py_DECREF(string);
10011 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012}
10013
Alexander Belopolsky40018472011-02-26 01:02:56 +000010014static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010015split(PyObject *self,
10016 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010017 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 int kind1, kind2, kind;
10020 void *buf1, *buf2;
10021 Py_ssize_t len1, len2;
10022 PyObject* out;
10023
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010025 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 if (PyUnicode_READY(self) == -1)
10028 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010031 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010033 if (PyUnicode_IS_ASCII(self))
10034 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010035 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010036 PyUnicode_GET_LENGTH(self), maxcount
10037 );
10038 else
10039 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010040 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010041 PyUnicode_GET_LENGTH(self), maxcount
10042 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 case PyUnicode_2BYTE_KIND:
10044 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010045 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 PyUnicode_GET_LENGTH(self), maxcount
10047 );
10048 case PyUnicode_4BYTE_KIND:
10049 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010050 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 PyUnicode_GET_LENGTH(self), maxcount
10052 );
10053 default:
10054 assert(0);
10055 return NULL;
10056 }
10057
10058 if (PyUnicode_READY(substring) == -1)
10059 return NULL;
10060
10061 kind1 = PyUnicode_KIND(self);
10062 kind2 = PyUnicode_KIND(substring);
10063 kind = kind1 > kind2 ? kind1 : kind2;
10064 buf1 = PyUnicode_DATA(self);
10065 buf2 = PyUnicode_DATA(substring);
10066 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010067 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 if (!buf1)
10069 return NULL;
10070 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010071 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 if (!buf2) {
10073 if (kind1 != kind) PyMem_Free(buf1);
10074 return NULL;
10075 }
10076 len1 = PyUnicode_GET_LENGTH(self);
10077 len2 = PyUnicode_GET_LENGTH(substring);
10078
Benjamin Petersonead6b532011-12-20 17:23:42 -060010079 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010081 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10082 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010083 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010084 else
10085 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010086 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 break;
10088 case PyUnicode_2BYTE_KIND:
10089 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010090 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 break;
10092 case PyUnicode_4BYTE_KIND:
10093 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010094 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 break;
10096 default:
10097 out = NULL;
10098 }
10099 if (kind1 != kind)
10100 PyMem_Free(buf1);
10101 if (kind2 != kind)
10102 PyMem_Free(buf2);
10103 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104}
10105
Alexander Belopolsky40018472011-02-26 01:02:56 +000010106static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010107rsplit(PyObject *self,
10108 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010109 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 int kind1, kind2, kind;
10112 void *buf1, *buf2;
10113 Py_ssize_t len1, len2;
10114 PyObject* out;
10115
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010116 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010117 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (PyUnicode_READY(self) == -1)
10120 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010123 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010125 if (PyUnicode_IS_ASCII(self))
10126 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010127 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010128 PyUnicode_GET_LENGTH(self), maxcount
10129 );
10130 else
10131 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010132 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010133 PyUnicode_GET_LENGTH(self), maxcount
10134 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 case PyUnicode_2BYTE_KIND:
10136 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010137 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 PyUnicode_GET_LENGTH(self), maxcount
10139 );
10140 case PyUnicode_4BYTE_KIND:
10141 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010142 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 PyUnicode_GET_LENGTH(self), maxcount
10144 );
10145 default:
10146 assert(0);
10147 return NULL;
10148 }
10149
10150 if (PyUnicode_READY(substring) == -1)
10151 return NULL;
10152
10153 kind1 = PyUnicode_KIND(self);
10154 kind2 = PyUnicode_KIND(substring);
10155 kind = kind1 > kind2 ? kind1 : kind2;
10156 buf1 = PyUnicode_DATA(self);
10157 buf2 = PyUnicode_DATA(substring);
10158 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010159 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 if (!buf1)
10161 return NULL;
10162 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010163 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 if (!buf2) {
10165 if (kind1 != kind) PyMem_Free(buf1);
10166 return NULL;
10167 }
10168 len1 = PyUnicode_GET_LENGTH(self);
10169 len2 = PyUnicode_GET_LENGTH(substring);
10170
Benjamin Petersonead6b532011-12-20 17:23:42 -060010171 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010173 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10174 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010175 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010176 else
10177 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010178 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 break;
10180 case PyUnicode_2BYTE_KIND:
10181 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010182 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 break;
10184 case PyUnicode_4BYTE_KIND:
10185 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010186 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 break;
10188 default:
10189 out = NULL;
10190 }
10191 if (kind1 != kind)
10192 PyMem_Free(buf1);
10193 if (kind2 != kind)
10194 PyMem_Free(buf2);
10195 return out;
10196}
10197
10198static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10200 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010202 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10205 return asciilib_find(buf1, len1, buf2, len2, offset);
10206 else
10207 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 case PyUnicode_2BYTE_KIND:
10209 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10210 case PyUnicode_4BYTE_KIND:
10211 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10212 }
10213 assert(0);
10214 return -1;
10215}
10216
10217static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10219 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010221 switch (kind) {
10222 case PyUnicode_1BYTE_KIND:
10223 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10224 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10225 else
10226 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10227 case PyUnicode_2BYTE_KIND:
10228 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10229 case PyUnicode_4BYTE_KIND:
10230 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10231 }
10232 assert(0);
10233 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010234}
10235
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010236static void
10237replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10238 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10239{
10240 int kind = PyUnicode_KIND(u);
10241 void *data = PyUnicode_DATA(u);
10242 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10243 if (kind == PyUnicode_1BYTE_KIND) {
10244 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10245 (Py_UCS1 *)data + len,
10246 u1, u2, maxcount);
10247 }
10248 else if (kind == PyUnicode_2BYTE_KIND) {
10249 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10250 (Py_UCS2 *)data + len,
10251 u1, u2, maxcount);
10252 }
10253 else {
10254 assert(kind == PyUnicode_4BYTE_KIND);
10255 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10256 (Py_UCS4 *)data + len,
10257 u1, u2, maxcount);
10258 }
10259}
10260
Alexander Belopolsky40018472011-02-26 01:02:56 +000010261static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262replace(PyObject *self, PyObject *str1,
10263 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 PyObject *u;
10266 char *sbuf = PyUnicode_DATA(self);
10267 char *buf1 = PyUnicode_DATA(str1);
10268 char *buf2 = PyUnicode_DATA(str2);
10269 int srelease = 0, release1 = 0, release2 = 0;
10270 int skind = PyUnicode_KIND(self);
10271 int kind1 = PyUnicode_KIND(str1);
10272 int kind2 = PyUnicode_KIND(str2);
10273 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10274 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10275 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010276 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010277 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278
10279 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010282 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283
Victor Stinner59de0ee2011-10-07 10:01:28 +020010284 if (str1 == str2)
10285 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286
Victor Stinner49a0a212011-10-12 23:46:10 +020010287 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010288 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10289 if (maxchar < maxchar_str1)
10290 /* substring too wide to be present */
10291 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010292 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10293 /* Replacing str1 with str2 may cause a maxchar reduction in the
10294 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010295 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010296 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010299 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010301 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010303 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010304 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010305 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010306
Victor Stinner69ed0f42013-04-09 21:48:24 +020010307 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010308 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010309 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010310 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010311 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010315
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010316 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10317 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010318 }
10319 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 int rkind = skind;
10321 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010322 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 if (kind1 < rkind) {
10325 /* widen substring */
10326 buf1 = _PyUnicode_AsKind(str1, rkind);
10327 if (!buf1) goto error;
10328 release1 = 1;
10329 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010330 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010331 if (i < 0)
10332 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 if (rkind > kind2) {
10334 /* widen replacement */
10335 buf2 = _PyUnicode_AsKind(str2, rkind);
10336 if (!buf2) goto error;
10337 release2 = 1;
10338 }
10339 else if (rkind < kind2) {
10340 /* widen self and buf1 */
10341 rkind = kind2;
10342 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010343 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 sbuf = _PyUnicode_AsKind(self, rkind);
10345 if (!sbuf) goto error;
10346 srelease = 1;
10347 buf1 = _PyUnicode_AsKind(str1, rkind);
10348 if (!buf1) goto error;
10349 release1 = 1;
10350 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010351 u = PyUnicode_New(slen, maxchar);
10352 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010354 assert(PyUnicode_KIND(u) == rkind);
10355 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010356
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010357 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010358 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010359 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010361 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010363
10364 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010365 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010366 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010368 if (i == -1)
10369 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010370 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010372 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010376 }
10377 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010379 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 int rkind = skind;
10381 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010384 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 buf1 = _PyUnicode_AsKind(str1, rkind);
10386 if (!buf1) goto error;
10387 release1 = 1;
10388 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010389 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010390 if (n == 0)
10391 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010393 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 buf2 = _PyUnicode_AsKind(str2, rkind);
10395 if (!buf2) goto error;
10396 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010399 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 rkind = kind2;
10401 sbuf = _PyUnicode_AsKind(self, rkind);
10402 if (!sbuf) goto error;
10403 srelease = 1;
10404 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010405 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 buf1 = _PyUnicode_AsKind(str1, rkind);
10407 if (!buf1) goto error;
10408 release1 = 1;
10409 }
10410 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10411 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010412 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 PyErr_SetString(PyExc_OverflowError,
10414 "replace string is too long");
10415 goto error;
10416 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010417 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010418 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010419 _Py_INCREF_UNICODE_EMPTY();
10420 if (!unicode_empty)
10421 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010422 u = unicode_empty;
10423 goto done;
10424 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010425 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 PyErr_SetString(PyExc_OverflowError,
10427 "replace string is too long");
10428 goto error;
10429 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010430 u = PyUnicode_New(new_size, maxchar);
10431 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010433 assert(PyUnicode_KIND(u) == rkind);
10434 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 ires = i = 0;
10436 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010437 while (n-- > 0) {
10438 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010439 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010440 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010441 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010442 if (j == -1)
10443 break;
10444 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010445 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010446 memcpy(res + rkind * ires,
10447 sbuf + rkind * i,
10448 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 }
10451 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010453 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010455 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010462 memcpy(res + rkind * ires,
10463 sbuf + rkind * i,
10464 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010465 }
10466 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010467 /* interleave */
10468 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010469 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010471 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473 if (--n <= 0)
10474 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 memcpy(res + rkind * ires,
10476 sbuf + rkind * i,
10477 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 ires++;
10479 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010480 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010481 memcpy(res + rkind * ires,
10482 sbuf + rkind * i,
10483 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010485 }
10486
10487 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010488 unicode_adjust_maxchar(&u);
10489 if (u == NULL)
10490 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010492
10493 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (srelease)
10495 PyMem_FREE(sbuf);
10496 if (release1)
10497 PyMem_FREE(buf1);
10498 if (release2)
10499 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010500 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010502
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 if (srelease)
10506 PyMem_FREE(sbuf);
10507 if (release1)
10508 PyMem_FREE(buf1);
10509 if (release2)
10510 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010511 return unicode_result_unchanged(self);
10512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 error:
10514 if (srelease && sbuf)
10515 PyMem_FREE(sbuf);
10516 if (release1 && buf1)
10517 PyMem_FREE(buf1);
10518 if (release2 && buf2)
10519 PyMem_FREE(buf2);
10520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521}
10522
10523/* --- Unicode Object Methods --------------------------------------------- */
10524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010525PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010526 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527\n\
10528Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010529characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530
10531static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010532unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010534 if (PyUnicode_READY(self) == -1)
10535 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010536 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537}
10538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010539PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010540 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541\n\
10542Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010543have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544
10545static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010546unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010548 if (PyUnicode_READY(self) == -1)
10549 return NULL;
10550 if (PyUnicode_GET_LENGTH(self) == 0)
10551 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010552 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553}
10554
Benjamin Petersond5890c82012-01-14 13:23:30 -050010555PyDoc_STRVAR(casefold__doc__,
10556 "S.casefold() -> str\n\
10557\n\
10558Return a version of S suitable for caseless comparisons.");
10559
10560static PyObject *
10561unicode_casefold(PyObject *self)
10562{
10563 if (PyUnicode_READY(self) == -1)
10564 return NULL;
10565 if (PyUnicode_IS_ASCII(self))
10566 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010567 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010568}
10569
10570
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010571/* Argument converter. Coerces to a single unicode character */
10572
10573static int
10574convert_uc(PyObject *obj, void *addr)
10575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010577 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010578
Benjamin Peterson14339b62009-01-31 16:36:08 +000010579 uniobj = PyUnicode_FromObject(obj);
10580 if (uniobj == NULL) {
10581 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010583 return 0;
10584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010586 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010587 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010588 Py_DECREF(uniobj);
10589 return 0;
10590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010592 Py_DECREF(uniobj);
10593 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010594}
10595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010596PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010599Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010600done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601
10602static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010603unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010605 Py_ssize_t marg, left;
10606 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 Py_UCS4 fillchar = ' ';
10608
Victor Stinnere9a29352011-10-01 02:14:59 +020010609 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Benjamin Petersonbac79492012-01-14 13:34:47 -050010612 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 return NULL;
10614
Victor Stinnerc4b49542011-12-11 22:44:26 +010010615 if (PyUnicode_GET_LENGTH(self) >= width)
10616 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Victor Stinnerc4b49542011-12-11 22:44:26 +010010618 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 left = marg / 2 + (marg & width & 1);
10620
Victor Stinner9310abb2011-10-05 00:59:23 +020010621 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622}
10623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624/* This function assumes that str1 and str2 are readied by the caller. */
10625
Marc-André Lemburge5034372000-08-08 08:04:29 +000010626static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010627unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010628{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010629#define COMPARE(TYPE1, TYPE2) \
10630 do { \
10631 TYPE1* p1 = (TYPE1 *)data1; \
10632 TYPE2* p2 = (TYPE2 *)data2; \
10633 TYPE1* end = p1 + len; \
10634 Py_UCS4 c1, c2; \
10635 for (; p1 != end; p1++, p2++) { \
10636 c1 = *p1; \
10637 c2 = *p2; \
10638 if (c1 != c2) \
10639 return (c1 < c2) ? -1 : 1; \
10640 } \
10641 } \
10642 while (0)
10643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 int kind1, kind2;
10645 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010646 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 kind1 = PyUnicode_KIND(str1);
10649 kind2 = PyUnicode_KIND(str2);
10650 data1 = PyUnicode_DATA(str1);
10651 data2 = PyUnicode_DATA(str2);
10652 len1 = PyUnicode_GET_LENGTH(str1);
10653 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010654 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010655
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010656 switch(kind1) {
10657 case PyUnicode_1BYTE_KIND:
10658 {
10659 switch(kind2) {
10660 case PyUnicode_1BYTE_KIND:
10661 {
10662 int cmp = memcmp(data1, data2, len);
10663 /* normalize result of memcmp() into the range [-1; 1] */
10664 if (cmp < 0)
10665 return -1;
10666 if (cmp > 0)
10667 return 1;
10668 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010669 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010670 case PyUnicode_2BYTE_KIND:
10671 COMPARE(Py_UCS1, Py_UCS2);
10672 break;
10673 case PyUnicode_4BYTE_KIND:
10674 COMPARE(Py_UCS1, Py_UCS4);
10675 break;
10676 default:
10677 assert(0);
10678 }
10679 break;
10680 }
10681 case PyUnicode_2BYTE_KIND:
10682 {
10683 switch(kind2) {
10684 case PyUnicode_1BYTE_KIND:
10685 COMPARE(Py_UCS2, Py_UCS1);
10686 break;
10687 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010688 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010689 COMPARE(Py_UCS2, Py_UCS2);
10690 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010691 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010692 case PyUnicode_4BYTE_KIND:
10693 COMPARE(Py_UCS2, Py_UCS4);
10694 break;
10695 default:
10696 assert(0);
10697 }
10698 break;
10699 }
10700 case PyUnicode_4BYTE_KIND:
10701 {
10702 switch(kind2) {
10703 case PyUnicode_1BYTE_KIND:
10704 COMPARE(Py_UCS4, Py_UCS1);
10705 break;
10706 case PyUnicode_2BYTE_KIND:
10707 COMPARE(Py_UCS4, Py_UCS2);
10708 break;
10709 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010710 {
10711#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10712 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10713 /* normalize result of wmemcmp() into the range [-1; 1] */
10714 if (cmp < 0)
10715 return -1;
10716 if (cmp > 0)
10717 return 1;
10718#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010719 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010720#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010721 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010722 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010723 default:
10724 assert(0);
10725 }
10726 break;
10727 }
10728 default:
10729 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010730 }
10731
Victor Stinner770e19e2012-10-04 22:59:45 +020010732 if (len1 == len2)
10733 return 0;
10734 if (len1 < len2)
10735 return -1;
10736 else
10737 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010738
10739#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010740}
10741
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010742Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010743unicode_compare_eq(PyObject *str1, PyObject *str2)
10744{
10745 int kind;
10746 void *data1, *data2;
10747 Py_ssize_t len;
10748 int cmp;
10749
Victor Stinnere5567ad2012-10-23 02:48:49 +020010750 len = PyUnicode_GET_LENGTH(str1);
10751 if (PyUnicode_GET_LENGTH(str2) != len)
10752 return 0;
10753 kind = PyUnicode_KIND(str1);
10754 if (PyUnicode_KIND(str2) != kind)
10755 return 0;
10756 data1 = PyUnicode_DATA(str1);
10757 data2 = PyUnicode_DATA(str2);
10758
10759 cmp = memcmp(data1, data2, len * kind);
10760 return (cmp == 0);
10761}
10762
10763
Alexander Belopolsky40018472011-02-26 01:02:56 +000010764int
10765PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10768 if (PyUnicode_READY(left) == -1 ||
10769 PyUnicode_READY(right) == -1)
10770 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010771
10772 /* a string is equal to itself */
10773 if (left == right)
10774 return 0;
10775
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010776 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010778 PyErr_Format(PyExc_TypeError,
10779 "Can't compare %.100s and %.100s",
10780 left->ob_type->tp_name,
10781 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782 return -1;
10783}
10784
Martin v. Löwis5b222132007-06-10 09:51:05 +000010785int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010786_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10787{
10788 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10789 if (right_str == NULL)
10790 return -1;
10791 return PyUnicode_Compare(left, right_str);
10792}
10793
10794int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010795PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 Py_ssize_t i;
10798 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 Py_UCS4 chr;
10800
Victor Stinner910337b2011-10-03 03:20:16 +020010801 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 if (PyUnicode_READY(uni) == -1)
10803 return -1;
10804 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010805 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010806 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010807 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010808 size_t len, len2 = strlen(str);
10809 int cmp;
10810
10811 len = Py_MIN(len1, len2);
10812 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010813 if (cmp != 0) {
10814 if (cmp < 0)
10815 return -1;
10816 else
10817 return 1;
10818 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010819 if (len1 > len2)
10820 return 1; /* uni is longer */
10821 if (len2 > len1)
10822 return -1; /* str is longer */
10823 return 0;
10824 }
10825 else {
10826 void *data = PyUnicode_DATA(uni);
10827 /* Compare Unicode string and source character set string */
10828 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10829 if (chr != str[i])
10830 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10831 /* This check keeps Python strings that end in '\0' from comparing equal
10832 to C strings identical up to that point. */
10833 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10834 return 1; /* uni is longer */
10835 if (str[i])
10836 return -1; /* str is longer */
10837 return 0;
10838 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010839}
10840
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010841
Benjamin Peterson29060642009-01-31 22:14:21 +000010842#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010843 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010844
Alexander Belopolsky40018472011-02-26 01:02:56 +000010845PyObject *
10846PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010847{
10848 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010849 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010850
Victor Stinnere5567ad2012-10-23 02:48:49 +020010851 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10852 Py_RETURN_NOTIMPLEMENTED;
10853
10854 if (PyUnicode_READY(left) == -1 ||
10855 PyUnicode_READY(right) == -1)
10856 return NULL;
10857
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010858 if (left == right) {
10859 switch (op) {
10860 case Py_EQ:
10861 case Py_LE:
10862 case Py_GE:
10863 /* a string is equal to itself */
10864 v = Py_True;
10865 break;
10866 case Py_NE:
10867 case Py_LT:
10868 case Py_GT:
10869 v = Py_False;
10870 break;
10871 default:
10872 PyErr_BadArgument();
10873 return NULL;
10874 }
10875 }
10876 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010877 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010878 result ^= (op == Py_NE);
10879 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010880 }
10881 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010882 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010883
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010884 /* Convert the return value to a Boolean */
10885 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010886 case Py_LE:
10887 v = TEST_COND(result <= 0);
10888 break;
10889 case Py_GE:
10890 v = TEST_COND(result >= 0);
10891 break;
10892 case Py_LT:
10893 v = TEST_COND(result == -1);
10894 break;
10895 case Py_GT:
10896 v = TEST_COND(result == 1);
10897 break;
10898 default:
10899 PyErr_BadArgument();
10900 return NULL;
10901 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010902 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010903 Py_INCREF(v);
10904 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010905}
10906
Alexander Belopolsky40018472011-02-26 01:02:56 +000010907int
10908PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010909{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010911 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 void *buf1, *buf2;
10913 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010914 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010915
10916 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010917 sub = PyUnicode_FromObject(element);
10918 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 PyErr_Format(PyExc_TypeError,
10920 "'in <string>' requires string as left operand, not %s",
10921 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010923 }
10924
Thomas Wouters477c8d52006-05-27 19:21:47 +000010925 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010926 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927 Py_DECREF(sub);
10928 return -1;
10929 }
10930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 kind1 = PyUnicode_KIND(str);
10932 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 buf1 = PyUnicode_DATA(str);
10934 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010935 if (kind2 != kind1) {
10936 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010937 Py_DECREF(sub);
10938 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010939 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010940 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010941 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 if (!buf2) {
10944 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010945 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 return -1;
10947 }
10948 len1 = PyUnicode_GET_LENGTH(str);
10949 len2 = PyUnicode_GET_LENGTH(sub);
10950
Victor Stinner77282cb2013-04-14 19:22:47 +020010951 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 case PyUnicode_1BYTE_KIND:
10953 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10954 break;
10955 case PyUnicode_2BYTE_KIND:
10956 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10957 break;
10958 case PyUnicode_4BYTE_KIND:
10959 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10960 break;
10961 default:
10962 result = -1;
10963 assert(0);
10964 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010965
10966 Py_DECREF(str);
10967 Py_DECREF(sub);
10968
Victor Stinner77282cb2013-04-14 19:22:47 +020010969 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 PyMem_Free(buf2);
10971
Guido van Rossum403d68b2000-03-13 15:55:09 +000010972 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010973}
10974
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975/* Concat to string or Unicode object giving a new Unicode object. */
10976
Alexander Belopolsky40018472011-02-26 01:02:56 +000010977PyObject *
10978PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010981 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010982 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
10984 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991
10992 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010993 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010997 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 }
11001
Victor Stinner488fa492011-12-12 00:01:39 +010011002 u_len = PyUnicode_GET_LENGTH(u);
11003 v_len = PyUnicode_GET_LENGTH(v);
11004 if (u_len > PY_SSIZE_T_MAX - v_len) {
11005 PyErr_SetString(PyExc_OverflowError,
11006 "strings are too large to concat");
11007 goto onError;
11008 }
11009 new_len = u_len + v_len;
11010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011012 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011013 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011016 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011019 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11020 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 Py_DECREF(u);
11022 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011023 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 Py_XDECREF(u);
11028 Py_XDECREF(v);
11029 return NULL;
11030}
11031
Walter Dörwald1ab83302007-05-18 17:15:44 +000011032void
Victor Stinner23e56682011-10-03 03:54:37 +020011033PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011034{
Victor Stinner23e56682011-10-03 03:54:37 +020011035 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011036 Py_UCS4 maxchar, maxchar2;
11037 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011038
11039 if (p_left == NULL) {
11040 if (!PyErr_Occurred())
11041 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011042 return;
11043 }
Victor Stinner23e56682011-10-03 03:54:37 +020011044 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011045 if (right == NULL || left == NULL
11046 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011047 if (!PyErr_Occurred())
11048 PyErr_BadInternalCall();
11049 goto error;
11050 }
11051
Benjamin Petersonbac79492012-01-14 13:34:47 -050011052 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011053 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011054 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011055 goto error;
11056
Victor Stinner488fa492011-12-12 00:01:39 +010011057 /* Shortcuts */
11058 if (left == unicode_empty) {
11059 Py_DECREF(left);
11060 Py_INCREF(right);
11061 *p_left = right;
11062 return;
11063 }
11064 if (right == unicode_empty)
11065 return;
11066
11067 left_len = PyUnicode_GET_LENGTH(left);
11068 right_len = PyUnicode_GET_LENGTH(right);
11069 if (left_len > PY_SSIZE_T_MAX - right_len) {
11070 PyErr_SetString(PyExc_OverflowError,
11071 "strings are too large to concat");
11072 goto error;
11073 }
11074 new_len = left_len + right_len;
11075
11076 if (unicode_modifiable(left)
11077 && PyUnicode_CheckExact(right)
11078 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011079 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11080 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011081 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011082 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011083 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11084 {
11085 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011086 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011087 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011088
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011089 /* copy 'right' into the newly allocated area of 'left' */
11090 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011091 }
Victor Stinner488fa492011-12-12 00:01:39 +010011092 else {
11093 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11094 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011095 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011096
Victor Stinner488fa492011-12-12 00:01:39 +010011097 /* Concat the two Unicode strings */
11098 res = PyUnicode_New(new_len, maxchar);
11099 if (res == NULL)
11100 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011101 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11102 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011103 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011104 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011105 }
11106 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011107 return;
11108
11109error:
Victor Stinner488fa492011-12-12 00:01:39 +010011110 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011111}
11112
11113void
11114PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11115{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011116 PyUnicode_Append(pleft, right);
11117 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011118}
11119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011120PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011121 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011123Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011124string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011125interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
11127static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011128unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011130 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011131 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011132 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 int kind1, kind2, kind;
11135 void *buf1, *buf2;
11136 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
Jesus Ceaac451502011-04-20 17:09:23 +020011138 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11139 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011140 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 kind1 = PyUnicode_KIND(self);
11143 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011144 if (kind2 > kind1) {
11145 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011146 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011147 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011148 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 buf1 = PyUnicode_DATA(self);
11150 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011152 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 if (!buf2) {
11154 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 return NULL;
11156 }
11157 len1 = PyUnicode_GET_LENGTH(self);
11158 len2 = PyUnicode_GET_LENGTH(substring);
11159
11160 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011161 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 case PyUnicode_1BYTE_KIND:
11163 iresult = ucs1lib_count(
11164 ((Py_UCS1*)buf1) + start, end - start,
11165 buf2, len2, PY_SSIZE_T_MAX
11166 );
11167 break;
11168 case PyUnicode_2BYTE_KIND:
11169 iresult = ucs2lib_count(
11170 ((Py_UCS2*)buf1) + start, end - start,
11171 buf2, len2, PY_SSIZE_T_MAX
11172 );
11173 break;
11174 case PyUnicode_4BYTE_KIND:
11175 iresult = ucs4lib_count(
11176 ((Py_UCS4*)buf1) + start, end - start,
11177 buf2, len2, PY_SSIZE_T_MAX
11178 );
11179 break;
11180 default:
11181 assert(0); iresult = 0;
11182 }
11183
11184 result = PyLong_FromSsize_t(iresult);
11185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 if (kind2 != kind)
11187 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
11189 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011190
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 return result;
11192}
11193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011194PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011195 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011197Encode S using the codec registered for encoding. Default encoding\n\
11198is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011199handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011200a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11201'xmlcharrefreplace' as well as any other name registered with\n\
11202codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
11204static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011205unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011207 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208 char *encoding = NULL;
11209 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011210
Benjamin Peterson308d6372009-09-18 21:42:35 +000011211 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11212 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011214 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011215}
11216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011217PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011218 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219\n\
11220Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011221If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222
11223static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011224unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011226 Py_ssize_t i, j, line_pos, src_len, incr;
11227 Py_UCS4 ch;
11228 PyObject *u;
11229 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011230 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011232 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011233 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234
Ezio Melotti745d54d2013-11-16 19:10:57 +020011235 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11236 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
Antoine Pitrou22425222011-10-04 19:10:51 +020011239 if (PyUnicode_READY(self) == -1)
11240 return NULL;
11241
Thomas Wouters7e474022000-07-16 12:04:32 +000011242 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011243 src_len = PyUnicode_GET_LENGTH(self);
11244 i = j = line_pos = 0;
11245 kind = PyUnicode_KIND(self);
11246 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011247 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011248 for (; i < src_len; i++) {
11249 ch = PyUnicode_READ(kind, src_data, i);
11250 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011251 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011253 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011254 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011255 goto overflow;
11256 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011257 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011258 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011262 goto overflow;
11263 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011265 if (ch == '\n' || ch == '\r')
11266 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011268 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011269 if (!found)
11270 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011271
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011273 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 if (!u)
11275 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011276 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277
Antoine Pitroue71d5742011-10-04 15:55:09 +020011278 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279
Antoine Pitroue71d5742011-10-04 15:55:09 +020011280 for (; i < src_len; i++) {
11281 ch = PyUnicode_READ(kind, src_data, i);
11282 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011284 incr = tabsize - (line_pos % tabsize);
11285 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011286 FILL(kind, dest_data, ' ', j, incr);
11287 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011289 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011291 line_pos++;
11292 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011293 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011294 if (ch == '\n' || ch == '\r')
11295 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 }
11298 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011299 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011300
Antoine Pitroue71d5742011-10-04 15:55:09 +020011301 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011302 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304}
11305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011306PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308\n\
11309Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011310such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311arguments start and end are interpreted as in slice notation.\n\
11312\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011313Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
11315static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011318 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011319 Py_ssize_t start;
11320 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011321 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
Jesus Ceaac451502011-04-20 17:09:23 +020011323 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11324 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
Christian Heimesd47802e2013-06-29 21:33:36 +020011327 if (PyUnicode_READY(self) == -1) {
11328 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011330 }
11331 if (PyUnicode_READY(substring) == -1) {
11332 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335
Victor Stinner7931d9a2011-11-04 00:22:48 +010011336 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337
11338 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 if (result == -2)
11341 return NULL;
11342
Christian Heimes217cfd12007-12-02 14:31:20 +000011343 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344}
11345
11346static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011347unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011349 void *data;
11350 enum PyUnicode_Kind kind;
11351 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011352
11353 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11354 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011356 }
11357 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11358 PyErr_SetString(PyExc_IndexError, "string index out of range");
11359 return NULL;
11360 }
11361 kind = PyUnicode_KIND(self);
11362 data = PyUnicode_DATA(self);
11363 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011364 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365}
11366
Guido van Rossumc2504932007-09-18 19:42:40 +000011367/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011368 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011369static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011370unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371{
Guido van Rossumc2504932007-09-18 19:42:40 +000011372 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011373 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011374
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011375#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011376 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011377#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 if (_PyUnicode_HASH(self) != -1)
11379 return _PyUnicode_HASH(self);
11380 if (PyUnicode_READY(self) == -1)
11381 return -1;
11382 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011383 /*
11384 We make the hash of the empty string be 0, rather than using
11385 (prefix ^ suffix), since this slightly obfuscates the hash secret
11386 */
11387 if (len == 0) {
11388 _PyUnicode_HASH(self) = 0;
11389 return 0;
11390 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011391 x = _Py_HashBytes(PyUnicode_DATA(self),
11392 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011394 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395}
11396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011397PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011398 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011400Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401
11402static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011405 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011406 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011407 Py_ssize_t start;
11408 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409
Jesus Ceaac451502011-04-20 17:09:23 +020011410 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11411 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
Christian Heimesd47a0452013-06-29 21:21:37 +020011414 if (PyUnicode_READY(self) == -1) {
11415 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011417 }
11418 if (PyUnicode_READY(substring) == -1) {
11419 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422
Victor Stinner7931d9a2011-11-04 00:22:48 +010011423 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
11425 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (result == -2)
11428 return NULL;
11429
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430 if (result < 0) {
11431 PyErr_SetString(PyExc_ValueError, "substring not found");
11432 return NULL;
11433 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011434
Christian Heimes217cfd12007-12-02 14:31:20 +000011435 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436}
11437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011438PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011441Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011442at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
11444static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011445unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 Py_ssize_t i, length;
11448 int kind;
11449 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 int cased;
11451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 if (PyUnicode_READY(self) == -1)
11453 return NULL;
11454 length = PyUnicode_GET_LENGTH(self);
11455 kind = PyUnicode_KIND(self);
11456 data = PyUnicode_DATA(self);
11457
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 if (length == 1)
11460 return PyBool_FromLong(
11461 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011463 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011466
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 for (i = 0; i < length; i++) {
11469 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011470
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11472 return PyBool_FromLong(0);
11473 else if (!cased && Py_UNICODE_ISLOWER(ch))
11474 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011476 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477}
11478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011479PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011482Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
11485static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011486unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 Py_ssize_t i, length;
11489 int kind;
11490 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 int cased;
11492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 if (PyUnicode_READY(self) == -1)
11494 return NULL;
11495 length = PyUnicode_GET_LENGTH(self);
11496 kind = PyUnicode_KIND(self);
11497 data = PyUnicode_DATA(self);
11498
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 if (length == 1)
11501 return PyBool_FromLong(
11502 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011504 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011507
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 for (i = 0; i < length; i++) {
11510 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011511
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11513 return PyBool_FromLong(0);
11514 else if (!cased && Py_UNICODE_ISUPPER(ch))
11515 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011517 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518}
11519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011520PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011523Return True if S is a titlecased string and there is at least one\n\
11524character in S, i.e. upper- and titlecase characters may only\n\
11525follow uncased characters and lowercase characters only cased ones.\n\
11526Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
11528static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011529unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 Py_ssize_t i, length;
11532 int kind;
11533 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 int cased, previous_is_cased;
11535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 if (PyUnicode_READY(self) == -1)
11537 return NULL;
11538 length = PyUnicode_GET_LENGTH(self);
11539 kind = PyUnicode_KIND(self);
11540 data = PyUnicode_DATA(self);
11541
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 if (length == 1) {
11544 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11545 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11546 (Py_UNICODE_ISUPPER(ch) != 0));
11547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011549 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011552
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 cased = 0;
11554 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 for (i = 0; i < length; i++) {
11556 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011557
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11559 if (previous_is_cased)
11560 return PyBool_FromLong(0);
11561 previous_is_cased = 1;
11562 cased = 1;
11563 }
11564 else if (Py_UNICODE_ISLOWER(ch)) {
11565 if (!previous_is_cased)
11566 return PyBool_FromLong(0);
11567 previous_is_cased = 1;
11568 cased = 1;
11569 }
11570 else
11571 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011573 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574}
11575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011576PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011579Return True if all characters in S are whitespace\n\
11580and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
11582static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011583unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585 Py_ssize_t i, length;
11586 int kind;
11587 void *data;
11588
11589 if (PyUnicode_READY(self) == -1)
11590 return NULL;
11591 length = PyUnicode_GET_LENGTH(self);
11592 kind = PyUnicode_KIND(self);
11593 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 if (length == 1)
11597 return PyBool_FromLong(
11598 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011600 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 for (i = 0; i < length; i++) {
11605 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011606 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011609 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610}
11611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011614\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011615Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011617
11618static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011619unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 Py_ssize_t i, length;
11622 int kind;
11623 void *data;
11624
11625 if (PyUnicode_READY(self) == -1)
11626 return NULL;
11627 length = PyUnicode_GET_LENGTH(self);
11628 kind = PyUnicode_KIND(self);
11629 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011630
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011631 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (length == 1)
11633 return PyBool_FromLong(
11634 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011635
11636 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 for (i = 0; i < length; i++) {
11641 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011643 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011644 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011645}
11646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011647PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011649\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011650Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011651and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011652
11653static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011654unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011655{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 int kind;
11657 void *data;
11658 Py_ssize_t len, i;
11659
11660 if (PyUnicode_READY(self) == -1)
11661 return NULL;
11662
11663 kind = PyUnicode_KIND(self);
11664 data = PyUnicode_DATA(self);
11665 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011666
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011667 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 if (len == 1) {
11669 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11670 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11671 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011672
11673 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 for (i = 0; i < len; i++) {
11678 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011679 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011681 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011682 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011683}
11684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011685PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011686 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011688Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011689False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690
11691static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011692unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 Py_ssize_t i, length;
11695 int kind;
11696 void *data;
11697
11698 if (PyUnicode_READY(self) == -1)
11699 return NULL;
11700 length = PyUnicode_GET_LENGTH(self);
11701 kind = PyUnicode_KIND(self);
11702 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 if (length == 1)
11706 return PyBool_FromLong(
11707 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011709 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 for (i = 0; i < length; i++) {
11714 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011717 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718}
11719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011720PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011723Return True if all characters in S are digits\n\
11724and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725
11726static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011727unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 Py_ssize_t i, length;
11730 int kind;
11731 void *data;
11732
11733 if (PyUnicode_READY(self) == -1)
11734 return NULL;
11735 length = PyUnicode_GET_LENGTH(self);
11736 kind = PyUnicode_KIND(self);
11737 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 if (length == 1) {
11741 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11742 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011745 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 for (i = 0; i < length; i++) {
11750 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011753 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754}
11755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011756PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011759Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011760False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761
11762static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011763unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 Py_ssize_t i, length;
11766 int kind;
11767 void *data;
11768
11769 if (PyUnicode_READY(self) == -1)
11770 return NULL;
11771 length = PyUnicode_GET_LENGTH(self);
11772 kind = PyUnicode_KIND(self);
11773 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (length == 1)
11777 return PyBool_FromLong(
11778 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011780 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 for (i = 0; i < length; i++) {
11785 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011788 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789}
11790
Martin v. Löwis47383402007-08-15 07:32:56 +000011791int
11792PyUnicode_IsIdentifier(PyObject *self)
11793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 int kind;
11795 void *data;
11796 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011797 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (PyUnicode_READY(self) == -1) {
11800 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 }
11803
11804 /* Special case for empty strings */
11805 if (PyUnicode_GET_LENGTH(self) == 0)
11806 return 0;
11807 kind = PyUnicode_KIND(self);
11808 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011809
11810 /* PEP 3131 says that the first character must be in
11811 XID_Start and subsequent characters in XID_Continue,
11812 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011813 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011814 letters, digits, underscore). However, given the current
11815 definition of XID_Start and XID_Continue, it is sufficient
11816 to check just for these, except that _ must be allowed
11817 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011819 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011820 return 0;
11821
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011822 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011825 return 1;
11826}
11827
11828PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011830\n\
11831Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011832to the language definition.\n\
11833\n\
11834Use keyword.iskeyword() to test for reserved identifiers\n\
11835such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011836
11837static PyObject*
11838unicode_isidentifier(PyObject *self)
11839{
11840 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11841}
11842
Georg Brandl559e5d72008-06-11 18:37:52 +000011843PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011844 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011845\n\
11846Return True if all characters in S are considered\n\
11847printable in repr() or S is empty, False otherwise.");
11848
11849static PyObject*
11850unicode_isprintable(PyObject *self)
11851{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 Py_ssize_t i, length;
11853 int kind;
11854 void *data;
11855
11856 if (PyUnicode_READY(self) == -1)
11857 return NULL;
11858 length = PyUnicode_GET_LENGTH(self);
11859 kind = PyUnicode_KIND(self);
11860 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011861
11862 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 if (length == 1)
11864 return PyBool_FromLong(
11865 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 for (i = 0; i < length; i++) {
11868 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011869 Py_RETURN_FALSE;
11870 }
11871 }
11872 Py_RETURN_TRUE;
11873}
11874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011875PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011876 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877\n\
11878Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011879iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880
11881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011882unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011884 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885}
11886
Martin v. Löwis18e16552006-02-15 17:27:45 +000011887static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011888unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 if (PyUnicode_READY(self) == -1)
11891 return -1;
11892 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893}
11894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011895PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011898Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011899done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
11901static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011902unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011904 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 Py_UCS4 fillchar = ' ';
11906
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011907 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 return NULL;
11909
Benjamin Petersonbac79492012-01-14 13:34:47 -050011910 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
Victor Stinnerc4b49542011-12-11 22:44:26 +010011913 if (PyUnicode_GET_LENGTH(self) >= width)
11914 return unicode_result_unchanged(self);
11915
11916 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917}
11918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011919PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011922Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
11924static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011925unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011927 if (PyUnicode_READY(self) == -1)
11928 return NULL;
11929 if (PyUnicode_IS_ASCII(self))
11930 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011931 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932}
11933
/* Selectors for the strip family.  They are also used as indices into
   stripformat[] below, so their numeric values must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
11942
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011943/* externally visible for str.strip(unicode) */
11944PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011945_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 void *data;
11948 int kind;
11949 Py_ssize_t i, j, len;
11950 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011951 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11954 return NULL;
11955
11956 kind = PyUnicode_KIND(self);
11957 data = PyUnicode_DATA(self);
11958 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011959 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11961 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011962 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011963
Benjamin Peterson14339b62009-01-31 16:36:08 +000011964 i = 0;
11965 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011966 while (i < len) {
11967 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11968 if (!BLOOM(sepmask, ch))
11969 break;
11970 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11971 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 i++;
11973 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011974 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011975
Benjamin Peterson14339b62009-01-31 16:36:08 +000011976 j = len;
11977 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011978 j--;
11979 while (j >= i) {
11980 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11981 if (!BLOOM(sepmask, ch))
11982 break;
11983 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11984 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011986 }
11987
Benjamin Peterson29060642009-01-31 22:14:21 +000011988 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011989 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011990
Victor Stinner7931d9a2011-11-04 00:22:48 +010011991 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992}
11993
11994PyObject*
11995PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11996{
11997 unsigned char *data;
11998 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011999 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000
Victor Stinnerde636f32011-10-01 03:55:54 +020012001 if (PyUnicode_READY(self) == -1)
12002 return NULL;
12003
Victor Stinner684d5fd2012-05-03 02:32:34 +020012004 length = PyUnicode_GET_LENGTH(self);
12005 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012006
Victor Stinner684d5fd2012-05-03 02:32:34 +020012007 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012008 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009
Victor Stinnerde636f32011-10-01 03:55:54 +020012010 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012011 PyErr_SetString(PyExc_IndexError, "string index out of range");
12012 return NULL;
12013 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012014 if (start >= length || end < start)
12015 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012016
Victor Stinner684d5fd2012-05-03 02:32:34 +020012017 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012018 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012019 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012020 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012021 }
12022 else {
12023 kind = PyUnicode_KIND(self);
12024 data = PyUnicode_1BYTE_DATA(self);
12025 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012026 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012027 length);
12028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030
12031static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012032do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 Py_ssize_t len, i, j;
12035
12036 if (PyUnicode_READY(self) == -1)
12037 return NULL;
12038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012040
Victor Stinnercc7af722013-04-09 22:39:24 +020012041 if (PyUnicode_IS_ASCII(self)) {
12042 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12043
12044 i = 0;
12045 if (striptype != RIGHTSTRIP) {
12046 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012047 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012048 if (!_Py_ascii_whitespace[ch])
12049 break;
12050 i++;
12051 }
12052 }
12053
12054 j = len;
12055 if (striptype != LEFTSTRIP) {
12056 j--;
12057 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012058 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012059 if (!_Py_ascii_whitespace[ch])
12060 break;
12061 j--;
12062 }
12063 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012064 }
12065 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012066 else {
12067 int kind = PyUnicode_KIND(self);
12068 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012069
Victor Stinnercc7af722013-04-09 22:39:24 +020012070 i = 0;
12071 if (striptype != RIGHTSTRIP) {
12072 while (i < len) {
12073 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12074 if (!Py_UNICODE_ISSPACE(ch))
12075 break;
12076 i++;
12077 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012078 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012079
12080 j = len;
12081 if (striptype != LEFTSTRIP) {
12082 j--;
12083 while (j >= i) {
12084 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12085 if (!Py_UNICODE_ISSPACE(ch))
12086 break;
12087 j--;
12088 }
12089 j++;
12090 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012091 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012092
Victor Stinner7931d9a2011-11-04 00:22:48 +010012093 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094}
12095
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012096
12097static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012098do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012099{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012100 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012101
Serhiy Storchakac6792272013-10-19 21:03:34 +030012102 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012103 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012104
Benjamin Peterson14339b62009-01-31 16:36:08 +000012105 if (sep != NULL && sep != Py_None) {
12106 if (PyUnicode_Check(sep))
12107 return _PyUnicode_XStrip(self, striptype, sep);
12108 else {
12109 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 "%s arg must be None or str",
12111 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012112 return NULL;
12113 }
12114 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012115
Benjamin Peterson14339b62009-01-31 16:36:08 +000012116 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012117}
12118
12119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012120PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012121 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012122\n\
12123Return a copy of the string S with leading and trailing\n\
12124whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012125If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012126
12127static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012128unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012129{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012130 if (PyTuple_GET_SIZE(args) == 0)
12131 return do_strip(self, BOTHSTRIP); /* Common case */
12132 else
12133 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012134}
12135
12136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012137PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139\n\
12140Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012141If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012142
12143static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012144unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012145{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012146 if (PyTuple_GET_SIZE(args) == 0)
12147 return do_strip(self, LEFTSTRIP); /* Common case */
12148 else
12149 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012150}
12151
12152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012153PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012155\n\
12156Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012157If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012158
12159static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012160unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012161{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012162 if (PyTuple_GET_SIZE(args) == 0)
12163 return do_strip(self, RIGHTSTRIP); /* Common case */
12164 else
12165 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166}
12167
12168
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012170unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012172 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
Serhiy Storchaka05997252013-01-26 12:14:02 +020012175 if (len < 1)
12176 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177
Victor Stinnerc4b49542011-12-11 22:44:26 +010012178 /* no repeat, return original string */
12179 if (len == 1)
12180 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012181
Benjamin Petersonbac79492012-01-14 13:34:47 -050012182 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 return NULL;
12184
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012185 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012186 PyErr_SetString(PyExc_OverflowError,
12187 "repeated string is too long");
12188 return NULL;
12189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012191
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012192 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 if (!u)
12194 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012195 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (PyUnicode_GET_LENGTH(str) == 1) {
12198 const int kind = PyUnicode_KIND(str);
12199 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012200 if (kind == PyUnicode_1BYTE_KIND) {
12201 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012202 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012203 }
12204 else if (kind == PyUnicode_2BYTE_KIND) {
12205 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012206 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012207 ucs2[n] = fill_char;
12208 } else {
12209 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12210 assert(kind == PyUnicode_4BYTE_KIND);
12211 for (n = 0; n < len; ++n)
12212 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012213 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 }
12215 else {
12216 /* number of characters copied this far */
12217 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012218 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 char *to = (char *) PyUnicode_DATA(u);
12220 Py_MEMCPY(to, PyUnicode_DATA(str),
12221 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012222 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 n = (done <= nchars-done) ? done : nchars-done;
12224 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012225 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 }
12228
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012229 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012230 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231}
12232
Alexander Belopolsky40018472011-02-26 01:02:56 +000012233PyObject *
12234PyUnicode_Replace(PyObject *obj,
12235 PyObject *subobj,
12236 PyObject *replobj,
12237 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238{
12239 PyObject *self;
12240 PyObject *str1;
12241 PyObject *str2;
12242 PyObject *result;
12243
12244 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012245 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012248 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012249 Py_DECREF(self);
12250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 }
12252 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012253 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 Py_DECREF(self);
12255 Py_DECREF(str1);
12256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012258 if (PyUnicode_READY(self) == -1 ||
12259 PyUnicode_READY(str1) == -1 ||
12260 PyUnicode_READY(str2) == -1)
12261 result = NULL;
12262 else
12263 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 Py_DECREF(self);
12265 Py_DECREF(str1);
12266 Py_DECREF(str2);
12267 return result;
12268}
12269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012270PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012271 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272\n\
12273Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012274old replaced by new. If the optional argument count is\n\
12275given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276
12277static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 PyObject *str1;
12281 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012282 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 PyObject *result;
12284
Martin v. Löwis18e16552006-02-15 17:27:45 +000012285 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012287 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012290 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 return NULL;
12292 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012293 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 Py_DECREF(str1);
12295 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012296 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012297 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12298 result = NULL;
12299 else
12300 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301
12302 Py_DECREF(str1);
12303 Py_DECREF(str2);
12304 return result;
12305}
12306
Alexander Belopolsky40018472011-02-26 01:02:56 +000012307static PyObject *
12308unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012310 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 Py_ssize_t isize;
12312 Py_ssize_t osize, squote, dquote, i, o;
12313 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012314 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012318 return NULL;
12319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 isize = PyUnicode_GET_LENGTH(unicode);
12321 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 /* Compute length of output, quote characters, and
12324 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012325 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 max = 127;
12327 squote = dquote = 0;
12328 ikind = PyUnicode_KIND(unicode);
12329 for (i = 0; i < isize; i++) {
12330 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12331 switch (ch) {
12332 case '\'': squote++; osize++; break;
12333 case '"': dquote++; osize++; break;
12334 case '\\': case '\t': case '\r': case '\n':
12335 osize += 2; break;
12336 default:
12337 /* Fast-path ASCII */
12338 if (ch < ' ' || ch == 0x7f)
12339 osize += 4; /* \xHH */
12340 else if (ch < 0x7f)
12341 osize++;
12342 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12343 osize++;
12344 max = ch > max ? ch : max;
12345 }
12346 else if (ch < 0x100)
12347 osize += 4; /* \xHH */
12348 else if (ch < 0x10000)
12349 osize += 6; /* \uHHHH */
12350 else
12351 osize += 10; /* \uHHHHHHHH */
12352 }
12353 }
12354
12355 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012356 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012358 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 if (dquote)
12360 /* Both squote and dquote present. Use squote,
12361 and escape them */
12362 osize += squote;
12363 else
12364 quote = '"';
12365 }
Victor Stinner55c08782013-04-14 18:45:39 +020012366 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367
12368 repr = PyUnicode_New(osize, max);
12369 if (repr == NULL)
12370 return NULL;
12371 okind = PyUnicode_KIND(repr);
12372 odata = PyUnicode_DATA(repr);
12373
12374 PyUnicode_WRITE(okind, odata, 0, quote);
12375 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012376 if (unchanged) {
12377 _PyUnicode_FastCopyCharacters(repr, 1,
12378 unicode, 0,
12379 isize);
12380 }
12381 else {
12382 for (i = 0, o = 1; i < isize; i++) {
12383 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384
Victor Stinner55c08782013-04-14 18:45:39 +020012385 /* Escape quotes and backslashes */
12386 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012387 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012389 continue;
12390 }
12391
12392 /* Map special whitespace to '\t', \n', '\r' */
12393 if (ch == '\t') {
12394 PyUnicode_WRITE(okind, odata, o++, '\\');
12395 PyUnicode_WRITE(okind, odata, o++, 't');
12396 }
12397 else if (ch == '\n') {
12398 PyUnicode_WRITE(okind, odata, o++, '\\');
12399 PyUnicode_WRITE(okind, odata, o++, 'n');
12400 }
12401 else if (ch == '\r') {
12402 PyUnicode_WRITE(okind, odata, o++, '\\');
12403 PyUnicode_WRITE(okind, odata, o++, 'r');
12404 }
12405
12406 /* Map non-printable US ASCII to '\xhh' */
12407 else if (ch < ' ' || ch == 0x7F) {
12408 PyUnicode_WRITE(okind, odata, o++, '\\');
12409 PyUnicode_WRITE(okind, odata, o++, 'x');
12410 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12411 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12412 }
12413
12414 /* Copy ASCII characters as-is */
12415 else if (ch < 0x7F) {
12416 PyUnicode_WRITE(okind, odata, o++, ch);
12417 }
12418
12419 /* Non-ASCII characters */
12420 else {
12421 /* Map Unicode whitespace and control characters
12422 (categories Z* and C* except ASCII space)
12423 */
12424 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12425 PyUnicode_WRITE(okind, odata, o++, '\\');
12426 /* Map 8-bit characters to '\xhh' */
12427 if (ch <= 0xff) {
12428 PyUnicode_WRITE(okind, odata, o++, 'x');
12429 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12430 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12431 }
12432 /* Map 16-bit characters to '\uxxxx' */
12433 else if (ch <= 0xffff) {
12434 PyUnicode_WRITE(okind, odata, o++, 'u');
12435 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12437 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12438 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12439 }
12440 /* Map 21-bit characters to '\U00xxxxxx' */
12441 else {
12442 PyUnicode_WRITE(okind, odata, o++, 'U');
12443 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12444 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12445 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12446 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12447 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12448 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12449 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12450 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12451 }
12452 }
12453 /* Copy characters as-is */
12454 else {
12455 PyUnicode_WRITE(okind, odata, o++, ch);
12456 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012457 }
12458 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012461 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012462 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463}
12464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012465PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012466 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467\n\
12468Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012469such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470arguments start and end are interpreted as in slice notation.\n\
12471\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012472Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473
12474static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012477 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012478 Py_ssize_t start;
12479 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012480 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481
Jesus Ceaac451502011-04-20 17:09:23 +020012482 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12483 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485
Christian Heimesea71a522013-06-29 21:17:34 +020012486 if (PyUnicode_READY(self) == -1) {
12487 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012489 }
12490 if (PyUnicode_READY(substring) == -1) {
12491 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494
Victor Stinner7931d9a2011-11-04 00:22:48 +010012495 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496
12497 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 if (result == -2)
12500 return NULL;
12501
Christian Heimes217cfd12007-12-02 14:31:20 +000012502 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503}
12504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012505PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012508Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
12510static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012513 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012514 Py_ssize_t start;
12515 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012516 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517
Jesus Ceaac451502011-04-20 17:09:23 +020012518 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12519 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
Christian Heimesea71a522013-06-29 21:17:34 +020012522 if (PyUnicode_READY(self) == -1) {
12523 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012525 }
12526 if (PyUnicode_READY(substring) == -1) {
12527 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012529 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530
Victor Stinner7931d9a2011-11-04 00:22:48 +010012531 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
12533 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 if (result == -2)
12536 return NULL;
12537
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538 if (result < 0) {
12539 PyErr_SetString(PyExc_ValueError, "substring not found");
12540 return NULL;
12541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542
Christian Heimes217cfd12007-12-02 14:31:20 +000012543 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544}
12545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012546PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012547 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012549Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012550done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551
12552static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012553unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012555 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 Py_UCS4 fillchar = ' ';
12557
Victor Stinnere9a29352011-10-01 02:14:59 +020012558 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012560
Benjamin Petersonbac79492012-01-14 13:34:47 -050012561 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562 return NULL;
12563
Victor Stinnerc4b49542011-12-11 22:44:26 +010012564 if (PyUnicode_GET_LENGTH(self) >= width)
12565 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566
Victor Stinnerc4b49542011-12-11 22:44:26 +010012567 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568}
12569
Alexander Belopolsky40018472011-02-26 01:02:56 +000012570PyObject *
12571PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572{
12573 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012574
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575 s = PyUnicode_FromObject(s);
12576 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012577 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 if (sep != NULL) {
12579 sep = PyUnicode_FromObject(sep);
12580 if (sep == NULL) {
12581 Py_DECREF(s);
12582 return NULL;
12583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584 }
12585
Victor Stinner9310abb2011-10-05 00:59:23 +020012586 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587
12588 Py_DECREF(s);
12589 Py_XDECREF(sep);
12590 return result;
12591}
12592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012593PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012594 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595\n\
12596Return a list of the words in S, using sep as the\n\
12597delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012598splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012599whitespace string is a separator and empty strings are\n\
12600removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601
12602static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012603unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012605 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012607 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012609 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12610 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611 return NULL;
12612
12613 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012616 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012618 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619}
12620
Thomas Wouters477c8d52006-05-27 19:21:47 +000012621PyObject *
12622PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12623{
12624 PyObject* str_obj;
12625 PyObject* sep_obj;
12626 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 int kind1, kind2, kind;
12628 void *buf1 = NULL, *buf2 = NULL;
12629 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012630
12631 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012632 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012634 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012635 if (!sep_obj) {
12636 Py_DECREF(str_obj);
12637 return NULL;
12638 }
12639 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12640 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012641 Py_DECREF(str_obj);
12642 return NULL;
12643 }
12644
Victor Stinner14f8f022011-10-05 20:58:25 +020012645 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012647 kind = Py_MAX(kind1, kind2);
12648 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012650 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 if (!buf1)
12652 goto onError;
12653 buf2 = PyUnicode_DATA(sep_obj);
12654 if (kind2 != kind)
12655 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12656 if (!buf2)
12657 goto onError;
12658 len1 = PyUnicode_GET_LENGTH(str_obj);
12659 len2 = PyUnicode_GET_LENGTH(sep_obj);
12660
Benjamin Petersonead6b532011-12-20 17:23:42 -060012661 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012663 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12664 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12665 else
12666 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 break;
12668 case PyUnicode_2BYTE_KIND:
12669 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12670 break;
12671 case PyUnicode_4BYTE_KIND:
12672 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12673 break;
12674 default:
12675 assert(0);
12676 out = 0;
12677 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012678
12679 Py_DECREF(sep_obj);
12680 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 if (kind1 != kind)
12682 PyMem_Free(buf1);
12683 if (kind2 != kind)
12684 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012685
12686 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 onError:
12688 Py_DECREF(sep_obj);
12689 Py_DECREF(str_obj);
12690 if (kind1 != kind && buf1)
12691 PyMem_Free(buf1);
12692 if (kind2 != kind && buf2)
12693 PyMem_Free(buf2);
12694 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012695}
12696
12697
12698PyObject *
12699PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12700{
12701 PyObject* str_obj;
12702 PyObject* sep_obj;
12703 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 int kind1, kind2, kind;
12705 void *buf1 = NULL, *buf2 = NULL;
12706 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012707
12708 str_obj = PyUnicode_FromObject(str_in);
12709 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012710 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711 sep_obj = PyUnicode_FromObject(sep_in);
12712 if (!sep_obj) {
12713 Py_DECREF(str_obj);
12714 return NULL;
12715 }
12716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 kind1 = PyUnicode_KIND(str_in);
12718 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012719 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 buf1 = PyUnicode_DATA(str_in);
12721 if (kind1 != kind)
12722 buf1 = _PyUnicode_AsKind(str_in, kind);
12723 if (!buf1)
12724 goto onError;
12725 buf2 = PyUnicode_DATA(sep_obj);
12726 if (kind2 != kind)
12727 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12728 if (!buf2)
12729 goto onError;
12730 len1 = PyUnicode_GET_LENGTH(str_obj);
12731 len2 = PyUnicode_GET_LENGTH(sep_obj);
12732
Benjamin Petersonead6b532011-12-20 17:23:42 -060012733 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012735 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12736 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12737 else
12738 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 break;
12740 case PyUnicode_2BYTE_KIND:
12741 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12742 break;
12743 case PyUnicode_4BYTE_KIND:
12744 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12745 break;
12746 default:
12747 assert(0);
12748 out = 0;
12749 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012750
12751 Py_DECREF(sep_obj);
12752 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 if (kind1 != kind)
12754 PyMem_Free(buf1);
12755 if (kind2 != kind)
12756 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757
12758 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012759 onError:
12760 Py_DECREF(sep_obj);
12761 Py_DECREF(str_obj);
12762 if (kind1 != kind && buf1)
12763 PyMem_Free(buf1);
12764 if (kind2 != kind && buf2)
12765 PyMem_Free(buf2);
12766 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012767}
12768
12769PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012771\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012772Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012773the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012774found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012775
12776static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012777unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012778{
Victor Stinner9310abb2011-10-05 00:59:23 +020012779 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012780}
12781
12782PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012783 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012784\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012785Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012787separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012788
12789static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012790unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012791{
Victor Stinner9310abb2011-10-05 00:59:23 +020012792 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012793}
12794
Alexander Belopolsky40018472011-02-26 01:02:56 +000012795PyObject *
12796PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012797{
12798 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012799
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012800 s = PyUnicode_FromObject(s);
12801 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012802 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 if (sep != NULL) {
12804 sep = PyUnicode_FromObject(sep);
12805 if (sep == NULL) {
12806 Py_DECREF(s);
12807 return NULL;
12808 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012809 }
12810
Victor Stinner9310abb2011-10-05 00:59:23 +020012811 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012812
12813 Py_DECREF(s);
12814 Py_XDECREF(sep);
12815 return result;
12816}
12817
12818PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012819 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012820\n\
12821Return a list of the words in S, using sep as the\n\
12822delimiter string, starting at the end of the string and\n\
12823working to the front. If maxsplit is given, at most maxsplit\n\
12824splits are done. If sep is not specified, any whitespace string\n\
12825is a separator.");
12826
12827static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012828unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012829{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012830 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012831 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012832 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012833
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012834 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12835 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012836 return NULL;
12837
12838 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012839 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012840 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012841 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012842 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012843 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012844}
12845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012846PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848\n\
12849Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012850Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012851is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852
12853static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012854unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012856 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012857 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012859 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12860 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861 return NULL;
12862
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012863 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864}
12865
12866static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012867PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012869 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870}
12871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012872PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874\n\
12875Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012876and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877
12878static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012879unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012881 if (PyUnicode_READY(self) == -1)
12882 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012883 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884}
12885
Larry Hastings61272b72014-01-07 12:41:53 -080012886/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012887
Larry Hastings31826802013-10-19 00:09:25 -070012888@staticmethod
12889str.maketrans as unicode_maketrans
12890
12891 x: object
12892
12893 y: unicode=NULL
12894
12895 z: unicode=NULL
12896
12897 /
12898
12899Return a translation table usable for str.translate().
12900
12901If there is only one argument, it must be a dictionary mapping Unicode
12902ordinals (integers) or characters to Unicode ordinals, strings or None.
12903Character keys will be then converted to ordinals.
12904If there are two arguments, they must be strings of equal length, and
12905in the resulting dictionary, each character in x will be mapped to the
12906character at the same position in y. If there is a third argument, it
12907must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012908[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012909
12910PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -080012911"maketrans(x, y=None, z=None, /)\n"
12912"--\n"
12913"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012914"Return a translation table usable for str.translate().\n"
12915"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012916"If there is only one argument, it must be a dictionary mapping Unicode\n"
12917"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12918"Character keys will be then converted to ordinals.\n"
12919"If there are two arguments, they must be strings of equal length, and\n"
12920"in the resulting dictionary, each character in x will be mapped to the\n"
12921"character at the same position in y. If there is a third argument, it\n"
12922"must be a string, whose characters will be mapped to None in the result.");
12923
12924#define UNICODE_MAKETRANS_METHODDEF \
12925 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12926
12927static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012928unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012929
12930static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012931unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012932{
Larry Hastings31826802013-10-19 00:09:25 -070012933 PyObject *return_value = NULL;
12934 PyObject *x;
12935 PyObject *y = NULL;
12936 PyObject *z = NULL;
12937
12938 if (!PyArg_ParseTuple(args,
12939 "O|UU:maketrans",
12940 &x, &y, &z))
12941 goto exit;
Larry Hastings5c661892014-01-24 06:17:25 -080012942 return_value = unicode_maketrans_impl(x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012943
12944exit:
12945 return return_value;
12946}
12947
12948static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012949unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012950/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012951{
Georg Brandlceee0772007-11-27 23:48:05 +000012952 PyObject *new = NULL, *key, *value;
12953 Py_ssize_t i = 0;
12954 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012955
Georg Brandlceee0772007-11-27 23:48:05 +000012956 new = PyDict_New();
12957 if (!new)
12958 return NULL;
12959 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 int x_kind, y_kind, z_kind;
12961 void *x_data, *y_data, *z_data;
12962
Georg Brandlceee0772007-11-27 23:48:05 +000012963 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012964 if (!PyUnicode_Check(x)) {
12965 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12966 "be a string if there is a second argument");
12967 goto err;
12968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012970 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12971 "arguments must have equal length");
12972 goto err;
12973 }
12974 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 x_kind = PyUnicode_KIND(x);
12976 y_kind = PyUnicode_KIND(y);
12977 x_data = PyUnicode_DATA(x);
12978 y_data = PyUnicode_DATA(y);
12979 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12980 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012981 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012982 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012983 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012984 if (!value) {
12985 Py_DECREF(key);
12986 goto err;
12987 }
Georg Brandlceee0772007-11-27 23:48:05 +000012988 res = PyDict_SetItem(new, key, value);
12989 Py_DECREF(key);
12990 Py_DECREF(value);
12991 if (res < 0)
12992 goto err;
12993 }
12994 /* create entries for deleting chars in z */
12995 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 z_kind = PyUnicode_KIND(z);
12997 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012998 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013000 if (!key)
13001 goto err;
13002 res = PyDict_SetItem(new, key, Py_None);
13003 Py_DECREF(key);
13004 if (res < 0)
13005 goto err;
13006 }
13007 }
13008 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 int kind;
13010 void *data;
13011
Georg Brandlceee0772007-11-27 23:48:05 +000013012 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013013 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013014 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13015 "to maketrans it must be a dict");
13016 goto err;
13017 }
13018 /* copy entries into the new dict, converting string keys to int keys */
13019 while (PyDict_Next(x, &i, &key, &value)) {
13020 if (PyUnicode_Check(key)) {
13021 /* convert string keys to integer keys */
13022 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013023 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013024 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13025 "table must be of length 1");
13026 goto err;
13027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 kind = PyUnicode_KIND(key);
13029 data = PyUnicode_DATA(key);
13030 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013031 if (!newkey)
13032 goto err;
13033 res = PyDict_SetItem(new, newkey, value);
13034 Py_DECREF(newkey);
13035 if (res < 0)
13036 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013037 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013038 /* just keep integer keys */
13039 if (PyDict_SetItem(new, key, value) < 0)
13040 goto err;
13041 } else {
13042 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13043 "be strings or integers");
13044 goto err;
13045 }
13046 }
13047 }
13048 return new;
13049 err:
13050 Py_DECREF(new);
13051 return NULL;
13052}
13053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013054PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056\n\
13057Return a copy of the string S, where all characters have been mapped\n\
13058through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013059Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013060Unmapped characters are left untouched. Characters mapped to None\n\
13061are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062
13063static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013064unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067}
13068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013069PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013072Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073
13074static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013075unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013077 if (PyUnicode_READY(self) == -1)
13078 return NULL;
13079 if (PyUnicode_IS_ASCII(self))
13080 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013081 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082}
13083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013084PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013087Pad a numeric string S with zeros on the left, to fill a field\n\
13088of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089
13090static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013091unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013093 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013094 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013095 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 int kind;
13097 void *data;
13098 Py_UCS4 chr;
13099
Martin v. Löwis18e16552006-02-15 17:27:45 +000013100 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101 return NULL;
13102
Benjamin Petersonbac79492012-01-14 13:34:47 -050013103 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013104 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105
Victor Stinnerc4b49542011-12-11 22:44:26 +010013106 if (PyUnicode_GET_LENGTH(self) >= width)
13107 return unicode_result_unchanged(self);
13108
13109 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
13111 u = pad(self, fill, 0, '0');
13112
Walter Dörwald068325e2002-04-15 13:36:47 +000013113 if (u == NULL)
13114 return NULL;
13115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013116 kind = PyUnicode_KIND(u);
13117 data = PyUnicode_DATA(u);
13118 chr = PyUnicode_READ(kind, data, fill);
13119
13120 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 PyUnicode_WRITE(kind, data, 0, chr);
13123 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124 }
13125
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013126 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013127 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
#if 0
/* Compiled out.  Forwards to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
13137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013138PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013141Return True if S starts with the specified prefix, False otherwise.\n\
13142With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013143With optional end, stop comparing S at that position.\n\
13144prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145
13146static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013147unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013150 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013151 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013152 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013153 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013154 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155
Jesus Ceaac451502011-04-20 17:09:23 +020013156 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 if (PyTuple_Check(subobj)) {
13159 Py_ssize_t i;
13160 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013161 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013162 if (substring == NULL)
13163 return NULL;
13164 result = tailmatch(self, substring, start, end, -1);
13165 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013166 if (result == -1)
13167 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013168 if (result) {
13169 Py_RETURN_TRUE;
13170 }
13171 }
13172 /* nothing matched */
13173 Py_RETURN_FALSE;
13174 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013175 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013176 if (substring == NULL) {
13177 if (PyErr_ExceptionMatches(PyExc_TypeError))
13178 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13179 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013181 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013182 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013184 if (result == -1)
13185 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013186 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187}
13188
13189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013190PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013192\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013193Return True if S ends with the specified suffix, False otherwise.\n\
13194With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013195With optional end, stop comparing S at that position.\n\
13196suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197
13198static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013199unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013202 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013203 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013204 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013205 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013206 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207
Jesus Ceaac451502011-04-20 17:09:23 +020013208 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013209 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013210 if (PyTuple_Check(subobj)) {
13211 Py_ssize_t i;
13212 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013213 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013214 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013215 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013217 result = tailmatch(self, substring, start, end, +1);
13218 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013219 if (result == -1)
13220 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013221 if (result) {
13222 Py_RETURN_TRUE;
13223 }
13224 }
13225 Py_RETURN_FALSE;
13226 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013227 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013228 if (substring == NULL) {
13229 if (PyErr_ExceptionMatches(PyExc_TypeError))
13230 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13231 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013232 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013233 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013234 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013235 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013236 if (result == -1)
13237 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013238 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239}
13240
Victor Stinner202fdca2012-05-07 12:47:02 +020013241Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013242_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013243{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013244 if (!writer->readonly)
13245 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13246 else {
13247 /* Copy-on-write mode: set buffer size to 0 so
13248 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13249 * next write. */
13250 writer->size = 0;
13251 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013252 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13253 writer->data = PyUnicode_DATA(writer->buffer);
13254 writer->kind = PyUnicode_KIND(writer->buffer);
13255}
13256
Victor Stinnerd3f08822012-05-29 12:57:52 +020013257void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013258_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013259{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013260 memset(writer, 0, sizeof(*writer));
13261#ifdef Py_DEBUG
13262 writer->kind = 5; /* invalid kind */
13263#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013264 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013265}
13266
Victor Stinnerd3f08822012-05-29 12:57:52 +020013267int
13268_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13269 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013270{
Victor Stinner6989ba02013-11-18 21:08:39 +010013271#ifdef MS_WINDOWS
13272 /* On Windows, overallocate by 50% is the best factor */
13273# define OVERALLOCATE_FACTOR 2
13274#else
13275 /* On Linux, overallocate by 25% is the best factor */
13276# define OVERALLOCATE_FACTOR 4
13277#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013278 Py_ssize_t newlen;
13279 PyObject *newbuffer;
13280
Victor Stinnerd3f08822012-05-29 12:57:52 +020013281 assert(length > 0);
13282
Victor Stinner202fdca2012-05-07 12:47:02 +020013283 if (length > PY_SSIZE_T_MAX - writer->pos) {
13284 PyErr_NoMemory();
13285 return -1;
13286 }
13287 newlen = writer->pos + length;
13288
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013289 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013290
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013292 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013293 if (writer->overallocate
13294 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13295 /* overallocate to limit the number of realloc() */
13296 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013297 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013298 if (newlen < writer->min_length)
13299 newlen = writer->min_length;
13300
Victor Stinnerd3f08822012-05-29 12:57:52 +020013301 writer->buffer = PyUnicode_New(newlen, maxchar);
13302 if (writer->buffer == NULL)
13303 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013304 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013305 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013306 if (writer->overallocate
13307 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13308 /* overallocate to limit the number of realloc() */
13309 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013310 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013311 if (newlen < writer->min_length)
13312 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013313
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013314 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013315 /* resize + widen */
13316 newbuffer = PyUnicode_New(newlen, maxchar);
13317 if (newbuffer == NULL)
13318 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013319 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13320 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013321 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013322 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013323 }
13324 else {
13325 newbuffer = resize_compact(writer->buffer, newlen);
13326 if (newbuffer == NULL)
13327 return -1;
13328 }
13329 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013330 }
13331 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013332 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013333 newbuffer = PyUnicode_New(writer->size, maxchar);
13334 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013335 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013336 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13337 writer->buffer, 0, writer->pos);
13338 Py_DECREF(writer->buffer);
13339 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013340 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013341 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013342 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013343
13344#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013345}
13346
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013347Py_LOCAL_INLINE(int)
13348_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013349{
13350 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13351 return -1;
13352 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13353 writer->pos++;
13354 return 0;
13355}
13356
13357int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013358_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13359{
13360 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13361}
13362
13363int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013364_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13365{
13366 Py_UCS4 maxchar;
13367 Py_ssize_t len;
13368
13369 if (PyUnicode_READY(str) == -1)
13370 return -1;
13371 len = PyUnicode_GET_LENGTH(str);
13372 if (len == 0)
13373 return 0;
13374 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13375 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013376 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013377 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013378 Py_INCREF(str);
13379 writer->buffer = str;
13380 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013381 writer->pos += len;
13382 return 0;
13383 }
13384 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13385 return -1;
13386 }
13387 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13388 str, 0, len);
13389 writer->pos += len;
13390 return 0;
13391}
13392
Victor Stinnere215d962012-10-06 23:03:36 +020013393int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013394_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13395 Py_ssize_t start, Py_ssize_t end)
13396{
13397 Py_UCS4 maxchar;
13398 Py_ssize_t len;
13399
13400 if (PyUnicode_READY(str) == -1)
13401 return -1;
13402
13403 assert(0 <= start);
13404 assert(end <= PyUnicode_GET_LENGTH(str));
13405 assert(start <= end);
13406
13407 if (end == 0)
13408 return 0;
13409
13410 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13411 return _PyUnicodeWriter_WriteStr(writer, str);
13412
13413 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13414 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13415 else
13416 maxchar = writer->maxchar;
13417 len = end - start;
13418
13419 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13420 return -1;
13421
13422 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13423 str, start, len);
13424 writer->pos += len;
13425 return 0;
13426}
13427
13428int
Victor Stinner4a587072013-11-19 12:54:53 +010013429_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13430 const char *ascii, Py_ssize_t len)
13431{
13432 if (len == -1)
13433 len = strlen(ascii);
13434
13435 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13436
13437 if (writer->buffer == NULL && !writer->overallocate) {
13438 PyObject *str;
13439
13440 str = _PyUnicode_FromASCII(ascii, len);
13441 if (str == NULL)
13442 return -1;
13443
13444 writer->readonly = 1;
13445 writer->buffer = str;
13446 _PyUnicodeWriter_Update(writer);
13447 writer->pos += len;
13448 return 0;
13449 }
13450
13451 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13452 return -1;
13453
13454 switch (writer->kind)
13455 {
13456 case PyUnicode_1BYTE_KIND:
13457 {
13458 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13459 Py_UCS1 *data = writer->data;
13460
13461 Py_MEMCPY(data + writer->pos, str, len);
13462 break;
13463 }
13464 case PyUnicode_2BYTE_KIND:
13465 {
13466 _PyUnicode_CONVERT_BYTES(
13467 Py_UCS1, Py_UCS2,
13468 ascii, ascii + len,
13469 (Py_UCS2 *)writer->data + writer->pos);
13470 break;
13471 }
13472 case PyUnicode_4BYTE_KIND:
13473 {
13474 _PyUnicode_CONVERT_BYTES(
13475 Py_UCS1, Py_UCS4,
13476 ascii, ascii + len,
13477 (Py_UCS4 *)writer->data + writer->pos);
13478 break;
13479 }
13480 default:
13481 assert(0);
13482 }
13483
13484 writer->pos += len;
13485 return 0;
13486}
13487
13488int
13489_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13490 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013491{
13492 Py_UCS4 maxchar;
13493
13494 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13495 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13496 return -1;
13497 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13498 writer->pos += len;
13499 return 0;
13500}
13501
Victor Stinnerd3f08822012-05-29 12:57:52 +020013502PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013503_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013504{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013505 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013507 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013508 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013509 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013510 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013511 str = writer->buffer;
13512 writer->buffer = NULL;
13513 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13514 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013515 }
13516 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13517 PyObject *newbuffer;
13518 newbuffer = resize_compact(writer->buffer, writer->pos);
13519 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013520 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013521 return NULL;
13522 }
13523 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013524 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013525 str = writer->buffer;
13526 writer->buffer = NULL;
13527 assert(_PyUnicode_CheckConsistency(str, 1));
13528 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013529}
13530
Victor Stinnerd3f08822012-05-29 12:57:52 +020013531void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013532_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013533{
13534 Py_CLEAR(writer->buffer);
13535}
13536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013537#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013538
13539PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013540 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013541\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013542Return a formatted version of S, using substitutions from args and kwargs.\n\
13543The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013544
Eric Smith27bbca62010-11-04 17:06:58 +000013545PyDoc_STRVAR(format_map__doc__,
13546 "S.format_map(mapping) -> str\n\
13547\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013548Return a formatted version of S, using substitutions from mapping.\n\
13549The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013550
Eric Smith4a7d76d2008-05-30 18:10:19 +000013551static PyObject *
13552unicode__format__(PyObject* self, PyObject* args)
13553{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013554 PyObject *format_spec;
13555 _PyUnicodeWriter writer;
13556 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013557
13558 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13559 return NULL;
13560
Victor Stinnerd3f08822012-05-29 12:57:52 +020013561 if (PyUnicode_READY(self) == -1)
13562 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013563 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013564 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13565 self, format_spec, 0,
13566 PyUnicode_GET_LENGTH(format_spec));
13567 if (ret == -1) {
13568 _PyUnicodeWriter_Dealloc(&writer);
13569 return NULL;
13570 }
13571 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013572}
13573
Eric Smith8c663262007-08-25 02:26:07 +000013574PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013576\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013577Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013578
13579static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013580unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013582 Py_ssize_t size;
13583
13584 /* If it's a compact object, account for base structure +
13585 character data. */
13586 if (PyUnicode_IS_COMPACT_ASCII(v))
13587 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13588 else if (PyUnicode_IS_COMPACT(v))
13589 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013590 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013591 else {
13592 /* If it is a two-block object, account for base object, and
13593 for character block if present. */
13594 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013595 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013597 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013598 }
13599 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013600 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013601 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013603 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013604 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013605
13606 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013607}
13608
13609PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013610 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013611
13612static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013613unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013614{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013615 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013616 if (!copy)
13617 return NULL;
13618 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013619}
13620
Guido van Rossumd57fd912000-03-10 22:53:23 +000013621static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013622 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013623 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013624 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13625 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013626 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13627 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013628 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013629 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13630 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13631 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013632 {"expandtabs", (PyCFunction) unicode_expandtabs,
13633 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013634 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013635 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013636 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13637 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13638 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013639 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013640 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13641 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13642 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013643 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013644 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013645 {"splitlines", (PyCFunction) unicode_splitlines,
13646 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013647 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013648 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13649 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13650 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13651 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13652 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13653 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13654 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13655 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13656 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13657 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13658 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13659 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13660 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13661 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013662 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013663 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013664 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013665 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013666 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013667 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013668 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013669 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013670#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013671 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013672 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013673#endif
13674
Benjamin Peterson14339b62009-01-31 16:36:08 +000013675 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013676 {NULL, NULL}
13677};
13678
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013679static PyObject *
13680unicode_mod(PyObject *v, PyObject *w)
13681{
Brian Curtindfc80e32011-08-10 20:28:54 -050013682 if (!PyUnicode_Check(v))
13683 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013684 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013685}
13686
13687static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013688 0, /*nb_add*/
13689 0, /*nb_subtract*/
13690 0, /*nb_multiply*/
13691 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013692};
13693
Guido van Rossumd57fd912000-03-10 22:53:23 +000013694static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013695 (lenfunc) unicode_length, /* sq_length */
13696 PyUnicode_Concat, /* sq_concat */
13697 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13698 (ssizeargfunc) unicode_getitem, /* sq_item */
13699 0, /* sq_slice */
13700 0, /* sq_ass_item */
13701 0, /* sq_ass_slice */
13702 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013703};
13704
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013705static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013706unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013707{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013708 if (PyUnicode_READY(self) == -1)
13709 return NULL;
13710
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013711 if (PyIndex_Check(item)) {
13712 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013713 if (i == -1 && PyErr_Occurred())
13714 return NULL;
13715 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013716 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013717 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013718 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013719 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013720 PyObject *result;
13721 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013722 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013723 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013725 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013727 return NULL;
13728 }
13729
13730 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013731 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013732 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013733 slicelength == PyUnicode_GET_LENGTH(self)) {
13734 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013735 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013736 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013737 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013738 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013739 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013740 src_kind = PyUnicode_KIND(self);
13741 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013742 if (!PyUnicode_IS_ASCII(self)) {
13743 kind_limit = kind_maxchar_limit(src_kind);
13744 max_char = 0;
13745 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13746 ch = PyUnicode_READ(src_kind, src_data, cur);
13747 if (ch > max_char) {
13748 max_char = ch;
13749 if (max_char >= kind_limit)
13750 break;
13751 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013752 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013753 }
Victor Stinner55c99112011-10-13 01:17:06 +020013754 else
13755 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013756 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013757 if (result == NULL)
13758 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013759 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013760 dest_data = PyUnicode_DATA(result);
13761
13762 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013763 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13764 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013765 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013766 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013767 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013768 } else {
13769 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13770 return NULL;
13771 }
13772}
13773
13774static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013775 (lenfunc)unicode_length, /* mp_length */
13776 (binaryfunc)unicode_subscript, /* mp_subscript */
13777 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013778};
13779
Guido van Rossumd57fd912000-03-10 22:53:23 +000013780
Guido van Rossumd57fd912000-03-10 22:53:23 +000013781/* Helpers for PyUnicode_Format() */
13782
Victor Stinnera47082312012-10-04 02:19:54 +020013783struct unicode_formatter_t {
13784 PyObject *args;
13785 int args_owned;
13786 Py_ssize_t arglen, argidx;
13787 PyObject *dict;
13788
13789 enum PyUnicode_Kind fmtkind;
13790 Py_ssize_t fmtcnt, fmtpos;
13791 void *fmtdata;
13792 PyObject *fmtstr;
13793
13794 _PyUnicodeWriter writer;
13795};
13796
13797struct unicode_format_arg_t {
13798 Py_UCS4 ch;
13799 int flags;
13800 Py_ssize_t width;
13801 int prec;
13802 int sign;
13803};
13804
Guido van Rossumd57fd912000-03-10 22:53:23 +000013805static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013806unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807{
Victor Stinnera47082312012-10-04 02:19:54 +020013808 Py_ssize_t argidx = ctx->argidx;
13809
13810 if (argidx < ctx->arglen) {
13811 ctx->argidx++;
13812 if (ctx->arglen < 0)
13813 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 else
Victor Stinnera47082312012-10-04 02:19:54 +020013815 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013816 }
13817 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013818 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013819 return NULL;
13820}
13821
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013822/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823
Victor Stinnera47082312012-10-04 02:19:54 +020013824/* Format a float into the writer if the writer is not NULL, or into *p_output
13825 otherwise.
13826
13827 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013828static int
Victor Stinnera47082312012-10-04 02:19:54 +020013829formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13830 PyObject **p_output,
13831 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013833 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013834 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013835 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013836 int prec;
13837 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013838
Guido van Rossumd57fd912000-03-10 22:53:23 +000013839 x = PyFloat_AsDouble(v);
13840 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013842
Victor Stinnera47082312012-10-04 02:19:54 +020013843 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013844 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013845 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013846
Victor Stinnera47082312012-10-04 02:19:54 +020013847 if (arg->flags & F_ALT)
13848 dtoa_flags = Py_DTSF_ALT;
13849 else
13850 dtoa_flags = 0;
13851 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013852 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013853 return -1;
13854 len = strlen(p);
13855 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013856 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013857 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013858 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013859 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013860 }
13861 else
13862 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013863 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013864 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013865}
13866
Victor Stinnerd0880d52012-04-27 23:40:13 +020013867/* formatlong() emulates the format codes d, u, o, x and X, and
13868 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13869 * Python's regular ints.
13870 * Return value: a new PyUnicodeObject*, or NULL if error.
13871 * The output string is of the form
13872 * "-"? ("0x" | "0X")? digit+
13873 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13874 * set in flags. The case of hex digits will be correct,
13875 * There will be at least prec digits, zero-filled on the left if
13876 * necessary to get that many.
13877 * val object to be converted
13878 * flags bitmask of format flags; only F_ALT is looked at
13879 * prec minimum number of digits; 0-fill on left if needed
13880 * type a character in [duoxX]; u acts the same as d
13881 *
13882 * CAUTION: o, x and X conversions on regular ints can never
13883 * produce a '-' sign, but can for Python's unbounded ints.
13884 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013885static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013886formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013887{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013888 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013889 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013890 Py_ssize_t i;
13891 int sign; /* 1 if '-', else 0 */
13892 int len; /* number of characters */
13893 Py_ssize_t llen;
13894 int numdigits; /* len == numnondigits + numdigits */
13895 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013896 int prec = arg->prec;
13897 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013898
Victor Stinnerd0880d52012-04-27 23:40:13 +020013899 /* Avoid exceeding SSIZE_T_MAX */
13900 if (prec > INT_MAX-3) {
13901 PyErr_SetString(PyExc_OverflowError,
13902 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013903 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013904 }
13905
13906 assert(PyLong_Check(val));
13907
13908 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013909 default:
13910 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013911 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013912 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013913 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013914 /* int and int subclasses should print numerically when a numeric */
13915 /* format code is used (see issue18780) */
13916 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013917 break;
13918 case 'o':
13919 numnondigits = 2;
13920 result = PyNumber_ToBase(val, 8);
13921 break;
13922 case 'x':
13923 case 'X':
13924 numnondigits = 2;
13925 result = PyNumber_ToBase(val, 16);
13926 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013927 }
13928 if (!result)
13929 return NULL;
13930
13931 assert(unicode_modifiable(result));
13932 assert(PyUnicode_IS_READY(result));
13933 assert(PyUnicode_IS_ASCII(result));
13934
13935 /* To modify the string in-place, there can only be one reference. */
13936 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013937 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013938 PyErr_BadInternalCall();
13939 return NULL;
13940 }
13941 buf = PyUnicode_DATA(result);
13942 llen = PyUnicode_GET_LENGTH(result);
13943 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013944 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013945 PyErr_SetString(PyExc_ValueError,
13946 "string too large in _PyBytes_FormatLong");
13947 return NULL;
13948 }
13949 len = (int)llen;
13950 sign = buf[0] == '-';
13951 numnondigits += sign;
13952 numdigits = len - numnondigits;
13953 assert(numdigits > 0);
13954
13955 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013956 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013957 (type == 'o' || type == 'x' || type == 'X'))) {
13958 assert(buf[sign] == '0');
13959 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13960 buf[sign+1] == 'o');
13961 numnondigits -= 2;
13962 buf += 2;
13963 len -= 2;
13964 if (sign)
13965 buf[0] = '-';
13966 assert(len == numnondigits + numdigits);
13967 assert(numdigits > 0);
13968 }
13969
13970 /* Fill with leading zeroes to meet minimum width. */
13971 if (prec > numdigits) {
13972 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13973 numnondigits + prec);
13974 char *b1;
13975 if (!r1) {
13976 Py_DECREF(result);
13977 return NULL;
13978 }
13979 b1 = PyBytes_AS_STRING(r1);
13980 for (i = 0; i < numnondigits; ++i)
13981 *b1++ = *buf++;
13982 for (i = 0; i < prec - numdigits; i++)
13983 *b1++ = '0';
13984 for (i = 0; i < numdigits; i++)
13985 *b1++ = *buf++;
13986 *b1 = '\0';
13987 Py_DECREF(result);
13988 result = r1;
13989 buf = PyBytes_AS_STRING(result);
13990 len = numnondigits + prec;
13991 }
13992
13993 /* Fix up case for hex conversions. */
13994 if (type == 'X') {
13995 /* Need to convert all lower case letters to upper case.
13996 and need to convert 0x to 0X (and -0x to -0X). */
13997 for (i = 0; i < len; i++)
13998 if (buf[i] >= 'a' && buf[i] <= 'x')
13999 buf[i] -= 'a'-'A';
14000 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014001 if (!PyUnicode_Check(result)
14002 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014003 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014004 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014005 Py_DECREF(result);
14006 result = unicode;
14007 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014008 else if (len != PyUnicode_GET_LENGTH(result)) {
14009 if (PyUnicode_Resize(&result, len) < 0)
14010 Py_CLEAR(result);
14011 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014012 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014013}
14014
Ethan Furmandf3ed242014-01-05 06:50:30 -080014015/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014016 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014017 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014018 * -1 and raise an exception on error */
14019static int
Victor Stinnera47082312012-10-04 02:19:54 +020014020mainformatlong(PyObject *v,
14021 struct unicode_format_arg_t *arg,
14022 PyObject **p_output,
14023 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014024{
14025 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014026 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014027
14028 if (!PyNumber_Check(v))
14029 goto wrongtype;
14030
Ethan Furman9ab74802014-03-21 06:38:46 -070014031 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014032 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014033 if (type == 'o' || type == 'x' || type == 'X') {
14034 iobj = PyNumber_Index(v);
14035 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014036 if (PyErr_ExceptionMatches(PyExc_TypeError))
14037 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014038 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014039 }
14040 }
14041 else {
14042 iobj = PyNumber_Long(v);
14043 if (iobj == NULL ) {
14044 if (PyErr_ExceptionMatches(PyExc_TypeError))
14045 goto wrongtype;
14046 return -1;
14047 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014048 }
14049 assert(PyLong_Check(iobj));
14050 }
14051 else {
14052 iobj = v;
14053 Py_INCREF(iobj);
14054 }
14055
14056 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014057 && arg->width == -1 && arg->prec == -1
14058 && !(arg->flags & (F_SIGN | F_BLANK))
14059 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014060 {
14061 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014062 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014063 int base;
14064
Victor Stinnera47082312012-10-04 02:19:54 +020014065 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014066 {
14067 default:
14068 assert(0 && "'type' not in [diuoxX]");
14069 case 'd':
14070 case 'i':
14071 case 'u':
14072 base = 10;
14073 break;
14074 case 'o':
14075 base = 8;
14076 break;
14077 case 'x':
14078 case 'X':
14079 base = 16;
14080 break;
14081 }
14082
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014083 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14084 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014085 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014086 }
14087 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014088 return 1;
14089 }
14090
Victor Stinnera47082312012-10-04 02:19:54 +020014091 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014092 Py_DECREF(iobj);
14093 if (res == NULL)
14094 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014095 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014096 return 0;
14097
14098wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014099 switch(type)
14100 {
14101 case 'o':
14102 case 'x':
14103 case 'X':
14104 PyErr_Format(PyExc_TypeError,
14105 "%%%c format: an integer is required, "
14106 "not %.200s",
14107 type, Py_TYPE(v)->tp_name);
14108 break;
14109 default:
14110 PyErr_Format(PyExc_TypeError,
14111 "%%%c format: a number is required, "
14112 "not %.200s",
14113 type, Py_TYPE(v)->tp_name);
14114 break;
14115 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014116 return -1;
14117}
14118
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014119static Py_UCS4
14120formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014121{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014122 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014123 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014124 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014125 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014126 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014127 goto onError;
14128 }
14129 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014130 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014131 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014132 /* make sure number is a type of integer */
14133 if (!PyLong_Check(v)) {
14134 iobj = PyNumber_Index(v);
14135 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014136 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014137 }
14138 v = iobj;
14139 Py_DECREF(iobj);
14140 }
14141 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014142 x = PyLong_AsLong(v);
14143 if (x == -1 && PyErr_Occurred())
14144 goto onError;
14145
Victor Stinner8faf8212011-12-08 22:14:11 +010014146 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014147 PyErr_SetString(PyExc_OverflowError,
14148 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014149 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014150 }
14151
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014152 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014153 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014154
Benjamin Peterson29060642009-01-31 22:14:21 +000014155 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014156 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014157 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014158 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014159}
14160
Victor Stinnera47082312012-10-04 02:19:54 +020014161/* Parse options of an argument: flags, width, precision.
14162 Handle also "%(name)" syntax.
14163
14164 Return 0 if the argument has been formatted into arg->str.
14165 Return 1 if the argument has been written into ctx->writer,
14166 Raise an exception and return -1 on error. */
14167static int
14168unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14169 struct unicode_format_arg_t *arg)
14170{
14171#define FORMAT_READ(ctx) \
14172 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14173
14174 PyObject *v;
14175
Victor Stinnera47082312012-10-04 02:19:54 +020014176 if (arg->ch == '(') {
14177 /* Get argument value from a dictionary. Example: "%(name)s". */
14178 Py_ssize_t keystart;
14179 Py_ssize_t keylen;
14180 PyObject *key;
14181 int pcount = 1;
14182
14183 if (ctx->dict == NULL) {
14184 PyErr_SetString(PyExc_TypeError,
14185 "format requires a mapping");
14186 return -1;
14187 }
14188 ++ctx->fmtpos;
14189 --ctx->fmtcnt;
14190 keystart = ctx->fmtpos;
14191 /* Skip over balanced parentheses */
14192 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14193 arg->ch = FORMAT_READ(ctx);
14194 if (arg->ch == ')')
14195 --pcount;
14196 else if (arg->ch == '(')
14197 ++pcount;
14198 ctx->fmtpos++;
14199 }
14200 keylen = ctx->fmtpos - keystart - 1;
14201 if (ctx->fmtcnt < 0 || pcount > 0) {
14202 PyErr_SetString(PyExc_ValueError,
14203 "incomplete format key");
14204 return -1;
14205 }
14206 key = PyUnicode_Substring(ctx->fmtstr,
14207 keystart, keystart + keylen);
14208 if (key == NULL)
14209 return -1;
14210 if (ctx->args_owned) {
14211 Py_DECREF(ctx->args);
14212 ctx->args_owned = 0;
14213 }
14214 ctx->args = PyObject_GetItem(ctx->dict, key);
14215 Py_DECREF(key);
14216 if (ctx->args == NULL)
14217 return -1;
14218 ctx->args_owned = 1;
14219 ctx->arglen = -1;
14220 ctx->argidx = -2;
14221 }
14222
14223 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014224 while (--ctx->fmtcnt >= 0) {
14225 arg->ch = FORMAT_READ(ctx);
14226 ctx->fmtpos++;
14227 switch (arg->ch) {
14228 case '-': arg->flags |= F_LJUST; continue;
14229 case '+': arg->flags |= F_SIGN; continue;
14230 case ' ': arg->flags |= F_BLANK; continue;
14231 case '#': arg->flags |= F_ALT; continue;
14232 case '0': arg->flags |= F_ZERO; continue;
14233 }
14234 break;
14235 }
14236
14237 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014238 if (arg->ch == '*') {
14239 v = unicode_format_getnextarg(ctx);
14240 if (v == NULL)
14241 return -1;
14242 if (!PyLong_Check(v)) {
14243 PyErr_SetString(PyExc_TypeError,
14244 "* wants int");
14245 return -1;
14246 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014247 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014248 if (arg->width == -1 && PyErr_Occurred())
14249 return -1;
14250 if (arg->width < 0) {
14251 arg->flags |= F_LJUST;
14252 arg->width = -arg->width;
14253 }
14254 if (--ctx->fmtcnt >= 0) {
14255 arg->ch = FORMAT_READ(ctx);
14256 ctx->fmtpos++;
14257 }
14258 }
14259 else if (arg->ch >= '0' && arg->ch <= '9') {
14260 arg->width = arg->ch - '0';
14261 while (--ctx->fmtcnt >= 0) {
14262 arg->ch = FORMAT_READ(ctx);
14263 ctx->fmtpos++;
14264 if (arg->ch < '0' || arg->ch > '9')
14265 break;
14266 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14267 mixing signed and unsigned comparison. Since arg->ch is between
14268 '0' and '9', casting to int is safe. */
14269 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14270 PyErr_SetString(PyExc_ValueError,
14271 "width too big");
14272 return -1;
14273 }
14274 arg->width = arg->width*10 + (arg->ch - '0');
14275 }
14276 }
14277
14278 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014279 if (arg->ch == '.') {
14280 arg->prec = 0;
14281 if (--ctx->fmtcnt >= 0) {
14282 arg->ch = FORMAT_READ(ctx);
14283 ctx->fmtpos++;
14284 }
14285 if (arg->ch == '*') {
14286 v = unicode_format_getnextarg(ctx);
14287 if (v == NULL)
14288 return -1;
14289 if (!PyLong_Check(v)) {
14290 PyErr_SetString(PyExc_TypeError,
14291 "* wants int");
14292 return -1;
14293 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014294 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014295 if (arg->prec == -1 && PyErr_Occurred())
14296 return -1;
14297 if (arg->prec < 0)
14298 arg->prec = 0;
14299 if (--ctx->fmtcnt >= 0) {
14300 arg->ch = FORMAT_READ(ctx);
14301 ctx->fmtpos++;
14302 }
14303 }
14304 else if (arg->ch >= '0' && arg->ch <= '9') {
14305 arg->prec = arg->ch - '0';
14306 while (--ctx->fmtcnt >= 0) {
14307 arg->ch = FORMAT_READ(ctx);
14308 ctx->fmtpos++;
14309 if (arg->ch < '0' || arg->ch > '9')
14310 break;
14311 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14312 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014313 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014314 return -1;
14315 }
14316 arg->prec = arg->prec*10 + (arg->ch - '0');
14317 }
14318 }
14319 }
14320
14321 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14322 if (ctx->fmtcnt >= 0) {
14323 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14324 if (--ctx->fmtcnt >= 0) {
14325 arg->ch = FORMAT_READ(ctx);
14326 ctx->fmtpos++;
14327 }
14328 }
14329 }
14330 if (ctx->fmtcnt < 0) {
14331 PyErr_SetString(PyExc_ValueError,
14332 "incomplete format");
14333 return -1;
14334 }
14335 return 0;
14336
14337#undef FORMAT_READ
14338}
14339
14340/* Format one argument. Supported conversion specifiers:
14341
14342 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014343 - "i", "d", "u": int or float
14344 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014345 - "e", "E", "f", "F", "g", "G": float
14346 - "c": int or str (1 character)
14347
Victor Stinner8dbd4212012-12-04 09:30:24 +010014348 When possible, the output is written directly into the Unicode writer
14349 (ctx->writer). A string is created when padding is required.
14350
Victor Stinnera47082312012-10-04 02:19:54 +020014351 Return 0 if the argument has been formatted into *p_str,
14352 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014353 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014354static int
14355unicode_format_arg_format(struct unicode_formatter_t *ctx,
14356 struct unicode_format_arg_t *arg,
14357 PyObject **p_str)
14358{
14359 PyObject *v;
14360 _PyUnicodeWriter *writer = &ctx->writer;
14361
14362 if (ctx->fmtcnt == 0)
14363 ctx->writer.overallocate = 0;
14364
14365 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014366 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014367 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014368 return 1;
14369 }
14370
14371 v = unicode_format_getnextarg(ctx);
14372 if (v == NULL)
14373 return -1;
14374
Victor Stinnera47082312012-10-04 02:19:54 +020014375
14376 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014377 case 's':
14378 case 'r':
14379 case 'a':
14380 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14381 /* Fast path */
14382 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14383 return -1;
14384 return 1;
14385 }
14386
14387 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14388 *p_str = v;
14389 Py_INCREF(*p_str);
14390 }
14391 else {
14392 if (arg->ch == 's')
14393 *p_str = PyObject_Str(v);
14394 else if (arg->ch == 'r')
14395 *p_str = PyObject_Repr(v);
14396 else
14397 *p_str = PyObject_ASCII(v);
14398 }
14399 break;
14400
14401 case 'i':
14402 case 'd':
14403 case 'u':
14404 case 'o':
14405 case 'x':
14406 case 'X':
14407 {
14408 int ret = mainformatlong(v, arg, p_str, writer);
14409 if (ret != 0)
14410 return ret;
14411 arg->sign = 1;
14412 break;
14413 }
14414
14415 case 'e':
14416 case 'E':
14417 case 'f':
14418 case 'F':
14419 case 'g':
14420 case 'G':
14421 if (arg->width == -1 && arg->prec == -1
14422 && !(arg->flags & (F_SIGN | F_BLANK)))
14423 {
14424 /* Fast path */
14425 if (formatfloat(v, arg, NULL, writer) == -1)
14426 return -1;
14427 return 1;
14428 }
14429
14430 arg->sign = 1;
14431 if (formatfloat(v, arg, p_str, NULL) == -1)
14432 return -1;
14433 break;
14434
14435 case 'c':
14436 {
14437 Py_UCS4 ch = formatchar(v);
14438 if (ch == (Py_UCS4) -1)
14439 return -1;
14440 if (arg->width == -1 && arg->prec == -1) {
14441 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014442 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014443 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014444 return 1;
14445 }
14446 *p_str = PyUnicode_FromOrdinal(ch);
14447 break;
14448 }
14449
14450 default:
14451 PyErr_Format(PyExc_ValueError,
14452 "unsupported format character '%c' (0x%x) "
14453 "at index %zd",
14454 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14455 (int)arg->ch,
14456 ctx->fmtpos - 1);
14457 return -1;
14458 }
14459 if (*p_str == NULL)
14460 return -1;
14461 assert (PyUnicode_Check(*p_str));
14462 return 0;
14463}
14464
14465static int
14466unicode_format_arg_output(struct unicode_formatter_t *ctx,
14467 struct unicode_format_arg_t *arg,
14468 PyObject *str)
14469{
14470 Py_ssize_t len;
14471 enum PyUnicode_Kind kind;
14472 void *pbuf;
14473 Py_ssize_t pindex;
14474 Py_UCS4 signchar;
14475 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014476 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014477 Py_ssize_t sublen;
14478 _PyUnicodeWriter *writer = &ctx->writer;
14479 Py_UCS4 fill;
14480
14481 fill = ' ';
14482 if (arg->sign && arg->flags & F_ZERO)
14483 fill = '0';
14484
14485 if (PyUnicode_READY(str) == -1)
14486 return -1;
14487
14488 len = PyUnicode_GET_LENGTH(str);
14489 if ((arg->width == -1 || arg->width <= len)
14490 && (arg->prec == -1 || arg->prec >= len)
14491 && !(arg->flags & (F_SIGN | F_BLANK)))
14492 {
14493 /* Fast path */
14494 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14495 return -1;
14496 return 0;
14497 }
14498
14499 /* Truncate the string for "s", "r" and "a" formats
14500 if the precision is set */
14501 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14502 if (arg->prec >= 0 && len > arg->prec)
14503 len = arg->prec;
14504 }
14505
14506 /* Adjust sign and width */
14507 kind = PyUnicode_KIND(str);
14508 pbuf = PyUnicode_DATA(str);
14509 pindex = 0;
14510 signchar = '\0';
14511 if (arg->sign) {
14512 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14513 if (ch == '-' || ch == '+') {
14514 signchar = ch;
14515 len--;
14516 pindex++;
14517 }
14518 else if (arg->flags & F_SIGN)
14519 signchar = '+';
14520 else if (arg->flags & F_BLANK)
14521 signchar = ' ';
14522 else
14523 arg->sign = 0;
14524 }
14525 if (arg->width < len)
14526 arg->width = len;
14527
14528 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014529 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014530 if (!(arg->flags & F_LJUST)) {
14531 if (arg->sign) {
14532 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014533 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014534 }
14535 else {
14536 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014537 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014538 }
14539 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014540 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14541 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014542 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014543 }
14544
Victor Stinnera47082312012-10-04 02:19:54 +020014545 buflen = arg->width;
14546 if (arg->sign && len == arg->width)
14547 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014548 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014549 return -1;
14550
14551 /* Write the sign if needed */
14552 if (arg->sign) {
14553 if (fill != ' ') {
14554 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14555 writer->pos += 1;
14556 }
14557 if (arg->width > len)
14558 arg->width--;
14559 }
14560
14561 /* Write the numeric prefix for "x", "X" and "o" formats
14562 if the alternate form is used.
14563 For example, write "0x" for the "%#x" format. */
14564 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14565 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14566 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14567 if (fill != ' ') {
14568 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14569 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14570 writer->pos += 2;
14571 pindex += 2;
14572 }
14573 arg->width -= 2;
14574 if (arg->width < 0)
14575 arg->width = 0;
14576 len -= 2;
14577 }
14578
14579 /* Pad left with the fill character if needed */
14580 if (arg->width > len && !(arg->flags & F_LJUST)) {
14581 sublen = arg->width - len;
14582 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14583 writer->pos += sublen;
14584 arg->width = len;
14585 }
14586
14587 /* If padding with spaces: write sign if needed and/or numeric prefix if
14588 the alternate form is used */
14589 if (fill == ' ') {
14590 if (arg->sign) {
14591 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14592 writer->pos += 1;
14593 }
14594 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14595 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14596 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14597 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14598 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14599 writer->pos += 2;
14600 pindex += 2;
14601 }
14602 }
14603
14604 /* Write characters */
14605 if (len) {
14606 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14607 str, pindex, len);
14608 writer->pos += len;
14609 }
14610
14611 /* Pad right with the fill character if needed */
14612 if (arg->width > len) {
14613 sublen = arg->width - len;
14614 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14615 writer->pos += sublen;
14616 }
14617 return 0;
14618}
14619
14620/* Helper of PyUnicode_Format(): format one arg.
14621 Return 0 on success, raise an exception and return -1 on error. */
14622static int
14623unicode_format_arg(struct unicode_formatter_t *ctx)
14624{
14625 struct unicode_format_arg_t arg;
14626 PyObject *str;
14627 int ret;
14628
Victor Stinner8dbd4212012-12-04 09:30:24 +010014629 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14630 arg.flags = 0;
14631 arg.width = -1;
14632 arg.prec = -1;
14633 arg.sign = 0;
14634 str = NULL;
14635
Victor Stinnera47082312012-10-04 02:19:54 +020014636 ret = unicode_format_arg_parse(ctx, &arg);
14637 if (ret == -1)
14638 return -1;
14639
14640 ret = unicode_format_arg_format(ctx, &arg, &str);
14641 if (ret == -1)
14642 return -1;
14643
14644 if (ret != 1) {
14645 ret = unicode_format_arg_output(ctx, &arg, str);
14646 Py_DECREF(str);
14647 if (ret == -1)
14648 return -1;
14649 }
14650
14651 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14652 PyErr_SetString(PyExc_TypeError,
14653 "not all arguments converted during string formatting");
14654 return -1;
14655 }
14656 return 0;
14657}
14658
Alexander Belopolsky40018472011-02-26 01:02:56 +000014659PyObject *
14660PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014661{
Victor Stinnera47082312012-10-04 02:19:54 +020014662 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014663
Guido van Rossumd57fd912000-03-10 22:53:23 +000014664 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014665 PyErr_BadInternalCall();
14666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014667 }
Victor Stinnera47082312012-10-04 02:19:54 +020014668
14669 ctx.fmtstr = PyUnicode_FromObject(format);
14670 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014671 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014672 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14673 Py_DECREF(ctx.fmtstr);
14674 return NULL;
14675 }
14676 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14677 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14678 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14679 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014680
Victor Stinner8f674cc2013-04-17 23:02:17 +020014681 _PyUnicodeWriter_Init(&ctx.writer);
14682 ctx.writer.min_length = ctx.fmtcnt + 100;
14683 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014684
Guido van Rossumd57fd912000-03-10 22:53:23 +000014685 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014686 ctx.arglen = PyTuple_Size(args);
14687 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014688 }
14689 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014690 ctx.arglen = -1;
14691 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014692 }
Victor Stinnera47082312012-10-04 02:19:54 +020014693 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014694 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014695 ctx.dict = args;
14696 else
14697 ctx.dict = NULL;
14698 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014699
Victor Stinnera47082312012-10-04 02:19:54 +020014700 while (--ctx.fmtcnt >= 0) {
14701 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014702 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014703
14704 nonfmtpos = ctx.fmtpos++;
14705 while (ctx.fmtcnt >= 0 &&
14706 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14707 ctx.fmtpos++;
14708 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014709 }
Victor Stinnera47082312012-10-04 02:19:54 +020014710 if (ctx.fmtcnt < 0) {
14711 ctx.fmtpos--;
14712 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014713 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014714
Victor Stinnercfc4c132013-04-03 01:48:39 +020014715 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14716 nonfmtpos, ctx.fmtpos) < 0)
14717 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014718 }
14719 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014720 ctx.fmtpos++;
14721 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014722 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014723 }
14724 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014725
Victor Stinnera47082312012-10-04 02:19:54 +020014726 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014727 PyErr_SetString(PyExc_TypeError,
14728 "not all arguments converted during string formatting");
14729 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014730 }
14731
Victor Stinnera47082312012-10-04 02:19:54 +020014732 if (ctx.args_owned) {
14733 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014734 }
Victor Stinnera47082312012-10-04 02:19:54 +020014735 Py_DECREF(ctx.fmtstr);
14736 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014737
Benjamin Peterson29060642009-01-31 22:14:21 +000014738 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014739 Py_DECREF(ctx.fmtstr);
14740 _PyUnicodeWriter_Dealloc(&ctx.writer);
14741 if (ctx.args_owned) {
14742 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014743 }
14744 return NULL;
14745}
14746
Jeremy Hylton938ace62002-07-17 16:30:39 +000014747static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014748unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14749
Tim Peters6d6c1a32001-08-02 04:15:00 +000014750static PyObject *
14751unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14752{
Benjamin Peterson29060642009-01-31 22:14:21 +000014753 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014754 static char *kwlist[] = {"object", "encoding", "errors", 0};
14755 char *encoding = NULL;
14756 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014757
Benjamin Peterson14339b62009-01-31 16:36:08 +000014758 if (type != &PyUnicode_Type)
14759 return unicode_subtype_new(type, args, kwds);
14760 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014761 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014762 return NULL;
14763 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014764 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014765 if (encoding == NULL && errors == NULL)
14766 return PyObject_Str(x);
14767 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014768 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014769}
14770
Guido van Rossume023fe02001-08-30 03:12:59 +000014771static PyObject *
14772unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14773{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014774 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014775 Py_ssize_t length, char_size;
14776 int share_wstr, share_utf8;
14777 unsigned int kind;
14778 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014779
Benjamin Peterson14339b62009-01-31 16:36:08 +000014780 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014781
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014782 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014783 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014784 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014785 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014786 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014787 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014788 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014789 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014790
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014791 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014792 if (self == NULL) {
14793 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014794 return NULL;
14795 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014796 kind = PyUnicode_KIND(unicode);
14797 length = PyUnicode_GET_LENGTH(unicode);
14798
14799 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014800#ifdef Py_DEBUG
14801 _PyUnicode_HASH(self) = -1;
14802#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014803 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014804#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014805 _PyUnicode_STATE(self).interned = 0;
14806 _PyUnicode_STATE(self).kind = kind;
14807 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014808 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014809 _PyUnicode_STATE(self).ready = 1;
14810 _PyUnicode_WSTR(self) = NULL;
14811 _PyUnicode_UTF8_LENGTH(self) = 0;
14812 _PyUnicode_UTF8(self) = NULL;
14813 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014814 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014815
14816 share_utf8 = 0;
14817 share_wstr = 0;
14818 if (kind == PyUnicode_1BYTE_KIND) {
14819 char_size = 1;
14820 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14821 share_utf8 = 1;
14822 }
14823 else if (kind == PyUnicode_2BYTE_KIND) {
14824 char_size = 2;
14825 if (sizeof(wchar_t) == 2)
14826 share_wstr = 1;
14827 }
14828 else {
14829 assert(kind == PyUnicode_4BYTE_KIND);
14830 char_size = 4;
14831 if (sizeof(wchar_t) == 4)
14832 share_wstr = 1;
14833 }
14834
14835 /* Ensure we won't overflow the length. */
14836 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14837 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014838 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014839 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014840 data = PyObject_MALLOC((length + 1) * char_size);
14841 if (data == NULL) {
14842 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014843 goto onError;
14844 }
14845
Victor Stinnerc3c74152011-10-02 20:39:55 +020014846 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014847 if (share_utf8) {
14848 _PyUnicode_UTF8_LENGTH(self) = length;
14849 _PyUnicode_UTF8(self) = data;
14850 }
14851 if (share_wstr) {
14852 _PyUnicode_WSTR_LENGTH(self) = length;
14853 _PyUnicode_WSTR(self) = (wchar_t *)data;
14854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014855
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014856 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014857 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014858 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014859#ifdef Py_DEBUG
14860 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14861#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014862 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014863 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014864
14865onError:
14866 Py_DECREF(unicode);
14867 Py_DECREF(self);
14868 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014869}
14870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014871PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014872"str(object='') -> str\n\
14873str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014874\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014875Create a new string object from the given object. If encoding or\n\
14876errors is specified, then the object must expose a data buffer\n\
14877that will be decoded using the given encoding and error handler.\n\
14878Otherwise, returns the result of object.__str__() (if defined)\n\
14879or repr(object).\n\
14880encoding defaults to sys.getdefaultencoding().\n\
14881errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014882
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014883static PyObject *unicode_iter(PyObject *seq);
14884
Guido van Rossumd57fd912000-03-10 22:53:23 +000014885PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014886 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014887 "str", /* tp_name */
14888 sizeof(PyUnicodeObject), /* tp_size */
14889 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014890 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014891 (destructor)unicode_dealloc, /* tp_dealloc */
14892 0, /* tp_print */
14893 0, /* tp_getattr */
14894 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014895 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014896 unicode_repr, /* tp_repr */
14897 &unicode_as_number, /* tp_as_number */
14898 &unicode_as_sequence, /* tp_as_sequence */
14899 &unicode_as_mapping, /* tp_as_mapping */
14900 (hashfunc) unicode_hash, /* tp_hash*/
14901 0, /* tp_call*/
14902 (reprfunc) unicode_str, /* tp_str */
14903 PyObject_GenericGetAttr, /* tp_getattro */
14904 0, /* tp_setattro */
14905 0, /* tp_as_buffer */
14906 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014907 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014908 unicode_doc, /* tp_doc */
14909 0, /* tp_traverse */
14910 0, /* tp_clear */
14911 PyUnicode_RichCompare, /* tp_richcompare */
14912 0, /* tp_weaklistoffset */
14913 unicode_iter, /* tp_iter */
14914 0, /* tp_iternext */
14915 unicode_methods, /* tp_methods */
14916 0, /* tp_members */
14917 0, /* tp_getset */
14918 &PyBaseObject_Type, /* tp_base */
14919 0, /* tp_dict */
14920 0, /* tp_descr_get */
14921 0, /* tp_descr_set */
14922 0, /* tp_dictoffset */
14923 0, /* tp_init */
14924 0, /* tp_alloc */
14925 unicode_new, /* tp_new */
14926 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014927};
14928
14929/* Initialize the Unicode implementation */
14930
Victor Stinner3a50e702011-10-18 21:21:00 +020014931int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014932{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014933 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014934 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014935 0x000A, /* LINE FEED */
14936 0x000D, /* CARRIAGE RETURN */
14937 0x001C, /* FILE SEPARATOR */
14938 0x001D, /* GROUP SEPARATOR */
14939 0x001E, /* RECORD SEPARATOR */
14940 0x0085, /* NEXT LINE */
14941 0x2028, /* LINE SEPARATOR */
14942 0x2029, /* PARAGRAPH SEPARATOR */
14943 };
14944
Fred Drakee4315f52000-05-09 19:53:39 +000014945 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014946 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014947 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014948 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014949 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014950
Guido van Rossumcacfc072002-05-24 19:01:59 +000014951 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014952 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014953
14954 /* initialize the linebreak bloom filter */
14955 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014956 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014957 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014958
Christian Heimes26532f72013-07-20 14:57:16 +020014959 if (PyType_Ready(&EncodingMapType) < 0)
14960 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014961
Benjamin Petersonc4311282012-10-30 23:21:10 -040014962 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14963 Py_FatalError("Can't initialize field name iterator type");
14964
14965 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14966 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014967
Victor Stinner3a50e702011-10-18 21:21:00 +020014968#ifdef HAVE_MBCS
14969 winver.dwOSVersionInfoSize = sizeof(winver);
14970 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14971 PyErr_SetFromWindowsErr(0);
14972 return -1;
14973 }
14974#endif
14975 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014976}
14977
14978/* Finalize the Unicode implementation */
14979
/* Nothing is released by this function; it always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14985
Guido van Rossumd57fd912000-03-10 22:53:23 +000014986void
Thomas Wouters78890102000-07-22 19:25:51 +000014987_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014988{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014989 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014990
Serhiy Storchaka05997252013-01-26 12:14:02 +020014991 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014992
Serhiy Storchaka05997252013-01-26 12:14:02 +020014993 for (i = 0; i < 256; i++)
14994 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014995 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014996 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014997}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014998
Walter Dörwald16807132007-05-25 13:52:07 +000014999void
15000PyUnicode_InternInPlace(PyObject **p)
15001{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015002 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015003 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015004#ifdef Py_DEBUG
15005 assert(s != NULL);
15006 assert(_PyUnicode_CHECK(s));
15007#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015008 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015009 return;
15010#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015011 /* If it's a subclass, we don't really know what putting
15012 it in the interned dict might do. */
15013 if (!PyUnicode_CheckExact(s))
15014 return;
15015 if (PyUnicode_CHECK_INTERNED(s))
15016 return;
15017 if (interned == NULL) {
15018 interned = PyDict_New();
15019 if (interned == NULL) {
15020 PyErr_Clear(); /* Don't leave an exception */
15021 return;
15022 }
15023 }
15024 /* It might be that the GetItem call fails even
15025 though the key is present in the dictionary,
15026 namely when this happens during a stack overflow. */
15027 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015028 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015029 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015030
Victor Stinnerf0335102013-04-14 19:13:03 +020015031 if (t) {
15032 Py_INCREF(t);
15033 Py_DECREF(*p);
15034 *p = t;
15035 return;
15036 }
Walter Dörwald16807132007-05-25 13:52:07 +000015037
Benjamin Peterson14339b62009-01-31 16:36:08 +000015038 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015039 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015040 PyErr_Clear();
15041 PyThreadState_GET()->recursion_critical = 0;
15042 return;
15043 }
15044 PyThreadState_GET()->recursion_critical = 0;
15045 /* The two references in interned are not counted by refcnt.
15046 The deallocator will take care of this */
15047 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015048 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015049}
15050
15051void
15052PyUnicode_InternImmortal(PyObject **p)
15053{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 PyUnicode_InternInPlace(p);
15055 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015056 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015057 Py_INCREF(*p);
15058 }
Walter Dörwald16807132007-05-25 13:52:07 +000015059}
15060
15061PyObject *
15062PyUnicode_InternFromString(const char *cp)
15063{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015064 PyObject *s = PyUnicode_FromString(cp);
15065 if (s == NULL)
15066 return NULL;
15067 PyUnicode_InternInPlace(&s);
15068 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015069}
15070
Alexander Belopolsky40018472011-02-26 01:02:56 +000015071void
15072_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015073{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015074 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015075 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015076 Py_ssize_t i, n;
15077 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015078
Benjamin Peterson14339b62009-01-31 16:36:08 +000015079 if (interned == NULL || !PyDict_Check(interned))
15080 return;
15081 keys = PyDict_Keys(interned);
15082 if (keys == NULL || !PyList_Check(keys)) {
15083 PyErr_Clear();
15084 return;
15085 }
Walter Dörwald16807132007-05-25 13:52:07 +000015086
Benjamin Peterson14339b62009-01-31 16:36:08 +000015087 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15088 detector, interned unicode strings are not forcibly deallocated;
15089 rather, we give them their stolen references back, and then clear
15090 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015091
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 n = PyList_GET_SIZE(keys);
15093 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015094 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015095 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015096 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015097 if (PyUnicode_READY(s) == -1) {
15098 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015099 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015101 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015102 case SSTATE_NOT_INTERNED:
15103 /* XXX Shouldn't happen */
15104 break;
15105 case SSTATE_INTERNED_IMMORTAL:
15106 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015107 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015108 break;
15109 case SSTATE_INTERNED_MORTAL:
15110 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015111 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015112 break;
15113 default:
15114 Py_FatalError("Inconsistent interned string state.");
15115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015116 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015117 }
15118 fprintf(stderr, "total size of all interned strings: "
15119 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15120 "mortal/immortal\n", mortal_size, immortal_size);
15121 Py_DECREF(keys);
15122 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015123 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015124}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015125
15126
15127/********************* Unicode Iterator **************************/
15128
15129typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015130 PyObject_HEAD
15131 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015132 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015133} unicodeiterobject;
15134
15135static void
15136unicodeiter_dealloc(unicodeiterobject *it)
15137{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015138 _PyObject_GC_UNTRACK(it);
15139 Py_XDECREF(it->it_seq);
15140 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015141}
15142
15143static int
15144unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15145{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015146 Py_VISIT(it->it_seq);
15147 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015148}
15149
15150static PyObject *
15151unicodeiter_next(unicodeiterobject *it)
15152{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015153 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015154
Benjamin Peterson14339b62009-01-31 16:36:08 +000015155 assert(it != NULL);
15156 seq = it->it_seq;
15157 if (seq == NULL)
15158 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015159 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015161 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15162 int kind = PyUnicode_KIND(seq);
15163 void *data = PyUnicode_DATA(seq);
15164 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15165 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015166 if (item != NULL)
15167 ++it->it_index;
15168 return item;
15169 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015170
Benjamin Peterson14339b62009-01-31 16:36:08 +000015171 Py_DECREF(seq);
15172 it->it_seq = NULL;
15173 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015174}
15175
15176static PyObject *
15177unicodeiter_len(unicodeiterobject *it)
15178{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015179 Py_ssize_t len = 0;
15180 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015181 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015182 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015183}
15184
15185PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15186
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015187static PyObject *
15188unicodeiter_reduce(unicodeiterobject *it)
15189{
15190 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015191 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015192 it->it_seq, it->it_index);
15193 } else {
15194 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15195 if (u == NULL)
15196 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015197 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015198 }
15199}
15200
15201PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15202
15203static PyObject *
15204unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15205{
15206 Py_ssize_t index = PyLong_AsSsize_t(state);
15207 if (index == -1 && PyErr_Occurred())
15208 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015209 if (it->it_seq != NULL) {
15210 if (index < 0)
15211 index = 0;
15212 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15213 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15214 it->it_index = index;
15215 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015216 Py_RETURN_NONE;
15217}
15218
15219PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15220
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015221static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015222 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015223 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015224 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15225 reduce_doc},
15226 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15227 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015228 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015229};
15230
15231PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015232 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15233 "str_iterator", /* tp_name */
15234 sizeof(unicodeiterobject), /* tp_basicsize */
15235 0, /* tp_itemsize */
15236 /* methods */
15237 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15238 0, /* tp_print */
15239 0, /* tp_getattr */
15240 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015241 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015242 0, /* tp_repr */
15243 0, /* tp_as_number */
15244 0, /* tp_as_sequence */
15245 0, /* tp_as_mapping */
15246 0, /* tp_hash */
15247 0, /* tp_call */
15248 0, /* tp_str */
15249 PyObject_GenericGetAttr, /* tp_getattro */
15250 0, /* tp_setattro */
15251 0, /* tp_as_buffer */
15252 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15253 0, /* tp_doc */
15254 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15255 0, /* tp_clear */
15256 0, /* tp_richcompare */
15257 0, /* tp_weaklistoffset */
15258 PyObject_SelfIter, /* tp_iter */
15259 (iternextfunc)unicodeiter_next, /* tp_iternext */
15260 unicodeiter_methods, /* tp_methods */
15261 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015262};
15263
15264static PyObject *
15265unicode_iter(PyObject *seq)
15266{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015268
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 if (!PyUnicode_Check(seq)) {
15270 PyErr_BadInternalCall();
15271 return NULL;
15272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015273 if (PyUnicode_READY(seq) == -1)
15274 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15276 if (it == NULL)
15277 return NULL;
15278 it->it_index = 0;
15279 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015280 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 _PyObject_GC_TRACK(it);
15282 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015283}
15284
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015285
/* Return the number of elements that precede the terminating null
   character of u.

   The counter has the same type as the return value (size_t), so the
   count cannot overflow a narrower signed type for very long strings. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
15294
/* Copy s2, including its terminating null character, to s1.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15302
/* Copy at most n elements of s2 to s1 and return s1.

   Copying stops after the terminating null character of s2 has been
   copied, or after n elements, whichever comes first.  As with C
   strncpy(), s1 is not null-terminated when s2 holds n or more
   characters; unlike strncpy(), the rest of s1 is not zero-padded.

   The limit is checked before each store, so no element beyond s1[n-1]
   is written and nothing is written at all when n is 0. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
15312
/* Append s2, including its terminating null character, to the end of
   the string in s1.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *end = s1;

    /* Find the terminating null character of s1. */
    while (*end) {
        end++;
    }
    /* Copy s2 over it, terminator included. */
    while ((*end++ = *s2++) != 0) {
    }
    return s1;
}
15321
/* Compare two null-terminated strings element by element.
   Returns -1, 0 or +1 when s1 is less than, equal to or greater than s2;
   a string that is a proper prefix of the other compares as less. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    for (; *s1 && *s1 == *s2; s1++, s2++) {
    }

    if (*s1 == 0) {
        return (*s2 == 0) ? 0 : -1;
    }
    if (*s2 == 0) {
        return 1;
    }
    return (*s1 < *s2) ? -1 : +1;
}
15335
/* Compare at most n elements of two null-terminated strings.
   Returns -1 or +1 at the first difference, and 0 when the strings are
   equal over the first n elements or up to a common terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE u1 = s1[i];
        Py_UNICODE u2 = s2[i];

        if (u1 != u2) {
            return (u1 < u2) ? -1 : +1;
        }
        if (u1 == '\0') {
            break;
        }
    }
    return 0;
}
15352
/* Return a pointer to the first occurrence of c in s, or NULL if c does
   not occur.  The terminating null character is not searched, so
   looking for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15362
15363Py_UNICODE*
15364Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15365{
15366 const Py_UNICODE *p;
15367 p = s + Py_UNICODE_strlen(s);
15368 while (p != s) {
15369 p--;
15370 if (*p == c)
15371 return (Py_UNICODE*)p;
15372 }
15373 return NULL;
15374}
Victor Stinner331ea922010-08-10 16:37:20 +000015375
Victor Stinner71133ff2010-09-01 23:43:53 +000015376Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015377PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015378{
Victor Stinner577db2c2011-10-11 22:12:48 +020015379 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015380 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015382 if (!PyUnicode_Check(unicode)) {
15383 PyErr_BadArgument();
15384 return NULL;
15385 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015386 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015387 if (u == NULL)
15388 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015389 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015390 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015391 PyErr_NoMemory();
15392 return NULL;
15393 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015394 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015395 size *= sizeof(Py_UNICODE);
15396 copy = PyMem_Malloc(size);
15397 if (copy == NULL) {
15398 PyErr_NoMemory();
15399 return NULL;
15400 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015401 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015402 return copy;
15403}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015404
Georg Brandl66c221e2010-10-14 07:04:07 +000015405/* A _string module, to export formatter_parser and formatter_field_name_split
15406 to the string.Formatter class implemented in Python. */
15407
15408static PyMethodDef _string_methods[] = {
15409 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15410 METH_O, PyDoc_STR("split the argument as a field name")},
15411 {"formatter_parser", (PyCFunction) formatter_parser,
15412 METH_O, PyDoc_STR("parse the argument as a format string")},
15413 {NULL, NULL}
15414};
15415
15416static struct PyModuleDef _string_module = {
15417 PyModuleDef_HEAD_INIT,
15418 "_string",
15419 PyDoc_STR("string helper module"),
15420 0,
15421 _string_methods,
15422 NULL,
15423 NULL,
15424 NULL,
15425 NULL
15426};
15427
15428PyMODINIT_FUNC
15429PyInit__string(void)
15430{
15431 return PyModule_Create(&_string_module);
15432}
15433
15434
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015435#ifdef __cplusplus
15436}
15437#endif