blob: 8f6cc9e0b34db3bd99c6fb67bf82c9ebc18f128d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Largest code point representable in Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Type check used by the internal macros of this file.  Release builds only
   verify the object type; debug builds run the structural consistency
   checker (without inspecting the character content). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Field accessors for the string object layouts.

   Macros whose name starts with an underscore expand to a plain member
   access (usable as an lvalue) and perform no checks, except
   _PyUnicode_KIND and _PyUnicode_GET_LENGTH which assert the object type
   first.  PyUnicode_UTF8 and PyUnicode_UTF8_LENGTH are the checked readers:
   they assert that the object is a ready string and, for compact ASCII
   strings, answer from the PyASCIIObject header (data pointer just past the
   header, length field) instead of the utf8/utf8_length members. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((char*)((PyASCIIObject*)(op) + 1))              \
         : _PyUnicode_UTF8(op))

#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((PyASCIIObject*)(op))->length                   \
         : _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the common header fields. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Type-asserting access to the kind and the length. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* Data pointer stored in the PyUnicodeObject structure. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY: asserts the object type (full
   consistency check in debug builds), evaluates to 0 when the string is
   already ready, and otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     (!PyUnicode_IS_READY(op)                   \
          ? _PyUnicode_Ready(op)                \
          : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the UTF-8 pointer of the object is the same pointer as its
   character data, i.e. no separate UTF-8 buffer is attached.  Must not be
   used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same pointer as its
   character data.  The macro argument is used for every access; the former
   expansion read the wstr pointer from a caller variable named "unicode"
   regardless of the argument passed in. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a UTF-8 memory block of its own, i.e. the
   string is not compact ASCII, the utf8 pointer is set, and it does not
   point at the character data.  The compact-ASCII test comes first so the
   utf8 member is never read for such strings. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op) != NULL                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own, i.e. the
   wstr pointer is set and it is not merely an alias of the character data
   of a ready string. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) != NULL &&                    \
      !(PyUnicode_IS_READY(op) &&                       \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
140
/* Generic helper macro converting a run of characters from one integer
   type to another.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to points to
   the destination buffer and should be of type "to_type *".  Each source
   character is cast to to_type and stored in order.  The bulk of the input
   is handled four characters per iteration, the remainder one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t _n = _end - _iter;                   \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        for (; _iter < _unrolled_end;                   \
               _iter += 4, _to += 4) {                  \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < _end; _iter++, _to++)            \
            *_to = (to_type) *_iter;                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  On creation an additional reference is
   taken so that the global keeps its own.  If the creation fails,
   unicode_empty is left NULL. */
#define _Py_INCREF_UNICODE_EMPTY()                                  \
    do {                                                            \
        if (unicode_empty == NULL) {                                \
            unicode_empty = PyUnicode_New(0, 0);                    \
            if (unicode_empty != NULL) {                            \
                Py_INCREF(unicode_empty);                           \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                       \
        }                                                           \
        else {                                                      \
            Py_INCREF(unicode_empty);                               \
        }                                                           \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (the returned value is NULL if the string could not be
   created). */
#define _Py_RETURN_UNICODE_EMPTY()                                  \
    do {                                                            \
        _Py_INCREF_UNICODE_EMPTY();                                 \
        return unicode_empty;                                       \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: a 128-entry
   lookup table indexed by ASCII code, 1 for whitespace and 0 otherwise.

   Entries set to 1:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Fast detection of line breaks in the ASCII range: a 128-entry lookup
   table indexed by ASCII code, 1 for a line break and 0 otherwise.

   Entries set to 1:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build sanity checker for the internal invariants of a string.

   op must be a unicode object.  The state flags, kind, data pointer and the
   utf8 / wstr caches are checked for mutual consistency with assert().  If
   check_content is non-zero and the string is not in the legacy wchar_t
   form, the characters are scanned as well to verify that the narrowest
   possible kind is used and that the data is null-terminated.

   Always returns 1, so that the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    kind = ascii->state.kind;

    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact, ready string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii) {
                    assert(compact->utf8 != data);
                }
                else {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
            }
            else {
                /* legacy string that has not been made ready yet */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the width of
               wchar_t */
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent cache must have a zero length */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
#if defined(HAVE_MBCS)
/* Windows version information; only present in builds with HAVE_MBCS. */
static OSVERSIONINFOEX winver;
#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
528/* stuff to implement simple "bloom filters" for Unicode characters.
529 to keep things simple, we use a single bitmask, using the least 5
530 bits from each unicode characters as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
Antoine Pitrouf068f942010-01-13 14:19:12 +0000548#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
Benjamin Peterson29060642009-01-31 22:14:21 +0000550#define BLOOM_LINEBREAK(ch) \
551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Debug-build helper: overwrite the characters in the range
   [old_length, current length) of `unicode` with 0xff bytes, so that code
   which forgets to initialize newly added characters is noticed early.

   The 0xff pattern yields U+00FF in a 1-byte string (not valid ASCII) and
   U+FFFFFFFF in a 4-byte string (not a valid code point); the file's own
   notes say _PyUnicode_CheckConsistency(str, 1) detects at least those two
   cases.  Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);
    int width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);

    if (new_length > old_length) {
        /* The kind value is used as the per-character byte width. */
        Py_UCS1 *first_new = bytes + old_length * width;
        memset(first_new, 0xff, (new_length - old_length) * width);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000870 Ux0000 terminated; some code (e.g. new_identifier)
871 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872
873 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000874 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875
876*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001014 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1015 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001016
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 if (ascii->wstr == data)
1018 printf("shared ");
1019 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001020
Victor Stinnera3b334d2011-10-03 13:53:37 +02001021 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001022 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001023 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1024 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001025 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1026 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001028 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001029}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030#endif
1031
1032PyObject *
1033PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1034{
1035 PyObject *obj;
1036 PyCompactUnicodeObject *unicode;
1037 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001038 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001039 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 Py_ssize_t char_size;
1041 Py_ssize_t struct_size;
1042
1043 /* Optimization for empty strings */
1044 if (size == 0 && unicode_empty != NULL) {
1045 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001046 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 }
1048
Victor Stinner9e9d6892011-10-04 01:02:02 +02001049 is_ascii = 0;
1050 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 struct_size = sizeof(PyCompactUnicodeObject);
1052 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001053 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 char_size = 1;
1055 is_ascii = 1;
1056 struct_size = sizeof(PyASCIIObject);
1057 }
1058 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001059 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 char_size = 1;
1061 }
1062 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001063 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 char_size = 2;
1065 if (sizeof(wchar_t) == 2)
1066 is_sharing = 1;
1067 }
1068 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001069 if (maxchar > MAX_UNICODE) {
1070 PyErr_SetString(PyExc_SystemError,
1071 "invalid maximum character passed to PyUnicode_New");
1072 return NULL;
1073 }
Victor Stinner8f825062012-04-27 13:55:39 +02001074 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 char_size = 4;
1076 if (sizeof(wchar_t) == 4)
1077 is_sharing = 1;
1078 }
1079
1080 /* Ensure we won't overflow the size. */
1081 if (size < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to PyUnicode_New");
1084 return NULL;
1085 }
1086 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1087 return PyErr_NoMemory();
1088
1089 /* Duplicated allocation code from _PyObject_New() instead of a call to
1090 * PyObject_New() so we are able to allocate space for the object and
1091 * it's data buffer.
1092 */
1093 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1094 if (obj == NULL)
1095 return PyErr_NoMemory();
1096 obj = PyObject_INIT(obj, &PyUnicode_Type);
1097 if (obj == NULL)
1098 return NULL;
1099
1100 unicode = (PyCompactUnicodeObject *)obj;
1101 if (is_ascii)
1102 data = ((PyASCIIObject*)obj) + 1;
1103 else
1104 data = unicode + 1;
1105 _PyUnicode_LENGTH(unicode) = size;
1106 _PyUnicode_HASH(unicode) = -1;
1107 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001108 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 _PyUnicode_STATE(unicode).compact = 1;
1110 _PyUnicode_STATE(unicode).ready = 1;
1111 _PyUnicode_STATE(unicode).ascii = is_ascii;
1112 if (is_ascii) {
1113 ((char*)data)[size] = 0;
1114 _PyUnicode_WSTR(unicode) = NULL;
1115 }
Victor Stinner8f825062012-04-27 13:55:39 +02001116 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 ((char*)data)[size] = 0;
1118 _PyUnicode_WSTR(unicode) = NULL;
1119 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001121 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 else {
1124 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001125 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001128 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 ((Py_UCS4*)data)[size] = 0;
1130 if (is_sharing) {
1131 _PyUnicode_WSTR_LENGTH(unicode) = size;
1132 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1133 }
1134 else {
1135 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1136 _PyUnicode_WSTR(unicode) = NULL;
1137 }
1138 }
Victor Stinner8f825062012-04-27 13:55:39 +02001139#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001140 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001141#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001142 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 return obj;
1144}
1145
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 data of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into one code point; every other unit, lone surrogates included,
   is copied as-is.  The remaining conversions are implemented as macros
   for efficiency.

   `unicode` must be a 4-byte-kind string whose length equals the number of
   code points produced (asserted), with room for a terminating null
   character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Well-formed pair: two input units, one output code point. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1185
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186static int
Victor Stinner488fa492011-12-12 00:01:39 +01001187unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001188{
Victor Stinner488fa492011-12-12 00:01:39 +01001189 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001190 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001191 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return -1;
1193 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001194 return 0;
1195}
1196
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001197static int
1198_copy_characters(PyObject *to, Py_ssize_t to_start,
1199 PyObject *from, Py_ssize_t from_start,
1200 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001202 unsigned int from_kind, to_kind;
1203 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204
Victor Stinneree4544c2012-05-09 22:24:08 +02001205 assert(0 <= how_many);
1206 assert(0 <= from_start);
1207 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001208 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001209 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001210 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211
Victor Stinnerd3f08822012-05-29 12:57:52 +02001212 assert(PyUnicode_Check(to));
1213 assert(PyUnicode_IS_READY(to));
1214 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1215
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001216 if (how_many == 0)
1217 return 0;
1218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001222 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001223
Victor Stinnerf1852262012-06-16 16:38:26 +02001224#ifdef Py_DEBUG
1225 if (!check_maxchar
1226 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1227 {
1228 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1229 Py_UCS4 ch;
1230 Py_ssize_t i;
1231 for (i=0; i < how_many; i++) {
1232 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1233 assert(ch <= to_maxchar);
1234 }
1235 }
1236#endif
1237
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001238 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001239 if (check_maxchar
1240 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1241 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 /* Writing Latin-1 characters into an ASCII string requires to
1243 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001244 Py_UCS4 max_char;
1245 max_char = ucs1lib_find_max_char(from_data,
1246 (Py_UCS1*)from_data + how_many);
1247 if (max_char >= 128)
1248 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001249 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001250 Py_MEMCPY((char*)to_data + to_kind * to_start,
1251 (char*)from_data + from_kind * from_start,
1252 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001254 else if (from_kind == PyUnicode_1BYTE_KIND
1255 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001256 {
1257 _PyUnicode_CONVERT_BYTES(
1258 Py_UCS1, Py_UCS2,
1259 PyUnicode_1BYTE_DATA(from) + from_start,
1260 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1261 PyUnicode_2BYTE_DATA(to) + to_start
1262 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001264 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001265 && to_kind == PyUnicode_4BYTE_KIND)
1266 {
1267 _PyUnicode_CONVERT_BYTES(
1268 Py_UCS1, Py_UCS4,
1269 PyUnicode_1BYTE_DATA(from) + from_start,
1270 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1271 PyUnicode_4BYTE_DATA(to) + to_start
1272 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001273 }
1274 else if (from_kind == PyUnicode_2BYTE_KIND
1275 && to_kind == PyUnicode_4BYTE_KIND)
1276 {
1277 _PyUnicode_CONVERT_BYTES(
1278 Py_UCS2, Py_UCS4,
1279 PyUnicode_2BYTE_DATA(from) + from_start,
1280 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1281 PyUnicode_4BYTE_DATA(to) + to_start
1282 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001283 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001284 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1286
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001287 if (!check_maxchar) {
1288 if (from_kind == PyUnicode_2BYTE_KIND
1289 && to_kind == PyUnicode_1BYTE_KIND)
1290 {
1291 _PyUnicode_CONVERT_BYTES(
1292 Py_UCS2, Py_UCS1,
1293 PyUnicode_2BYTE_DATA(from) + from_start,
1294 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1295 PyUnicode_1BYTE_DATA(to) + to_start
1296 );
1297 }
1298 else if (from_kind == PyUnicode_4BYTE_KIND
1299 && to_kind == PyUnicode_1BYTE_KIND)
1300 {
1301 _PyUnicode_CONVERT_BYTES(
1302 Py_UCS4, Py_UCS1,
1303 PyUnicode_4BYTE_DATA(from) + from_start,
1304 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1305 PyUnicode_1BYTE_DATA(to) + to_start
1306 );
1307 }
1308 else if (from_kind == PyUnicode_4BYTE_KIND
1309 && to_kind == PyUnicode_2BYTE_KIND)
1310 {
1311 _PyUnicode_CONVERT_BYTES(
1312 Py_UCS4, Py_UCS2,
1313 PyUnicode_4BYTE_DATA(from) + from_start,
1314 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1315 PyUnicode_2BYTE_DATA(to) + to_start
1316 );
1317 }
1318 else {
1319 assert(0);
1320 return -1;
1321 }
1322 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001323 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001325 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 Py_ssize_t i;
1327
Victor Stinnera0702ab2011-09-29 14:14:38 +02001328 for (i=0; i < how_many; i++) {
1329 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001330 if (ch > to_maxchar)
1331 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1333 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001334 }
1335 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001336 return 0;
1337}
1338
Victor Stinnerd3f08822012-05-29 12:57:52 +02001339void
1340_PyUnicode_FastCopyCharacters(
1341 PyObject *to, Py_ssize_t to_start,
1342 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001343{
1344 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1345}
1346
1347Py_ssize_t
1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1349 PyObject *from, Py_ssize_t from_start,
1350 Py_ssize_t how_many)
1351{
1352 int err;
1353
1354 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1355 PyErr_BadInternalCall();
1356 return -1;
1357 }
1358
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001361 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001362 return -1;
1363
Victor Stinnerd3f08822012-05-29 12:57:52 +02001364 if (from_start < 0) {
1365 PyErr_SetString(PyExc_IndexError, "string index out of range");
1366 return -1;
1367 }
1368 if (to_start < 0) {
1369 PyErr_SetString(PyExc_IndexError, "string index out of range");
1370 return -1;
1371 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001372 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1373 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1374 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001375 "Cannot write %zi characters at %zi "
1376 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001377 how_many, to_start, PyUnicode_GET_LENGTH(to));
1378 return -1;
1379 }
1380
1381 if (how_many == 0)
1382 return 0;
1383
Victor Stinner488fa492011-12-12 00:01:39 +01001384 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385 return -1;
1386
1387 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1388 if (err) {
1389 PyErr_Format(PyExc_SystemError,
1390 "Cannot copy %s characters "
1391 "into a string of %s characters",
1392 unicode_kind_name(from),
1393 unicode_kind_name(to));
1394 return -1;
1395 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001396 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397}
1398
Victor Stinner17222162011-09-28 22:15:37 +02001399/* Find the maximum code point and count the number of surrogate pairs so a
1400 correct string length can be computed before converting a string to UCS4.
1401 This function counts single surrogates as a character and not as a pair.
1402
1403 Return 0 on success, or -1 on error. */
1404static int
1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1406 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407{
1408 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001409 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerc53be962011-10-02 21:33:54 +02001411 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 *num_surrogates = 0;
1413 *maxchar = 0;
1414
1415 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001417 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1418 && (iter+1) < end
1419 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1420 {
1421 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1422 ++(*num_surrogates);
1423 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 }
1425 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001427 {
1428 ch = *iter;
1429 iter++;
1430 }
1431 if (ch > *maxchar) {
1432 *maxchar = ch;
1433 if (*maxchar > MAX_UNICODE) {
1434 PyErr_Format(PyExc_ValueError,
1435 "character U+%x is not in range [U+0000; U+10ffff]",
1436 ch);
1437 return -1;
1438 }
1439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
1441 return 0;
1442}
1443
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001444int
1445_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446{
1447 wchar_t *end;
1448 Py_UCS4 maxchar = 0;
1449 Py_ssize_t num_surrogates;
1450#if SIZEOF_WCHAR_T == 2
1451 Py_ssize_t length_wo_surrogates;
1452#endif
1453
Georg Brandl7597add2011-10-05 16:36:47 +02001454 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001455 strings were created using _PyObject_New() and where no canonical
1456 representation (the str field) has been set yet aka strings
1457 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001458 assert(_PyUnicode_CHECK(unicode));
1459 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001461 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001462 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001463 /* Actually, it should neither be interned nor be anything else: */
1464 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001467 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001468 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
1471 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001472 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1473 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 PyErr_NoMemory();
1475 return -1;
1476 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001477 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 _PyUnicode_WSTR(unicode), end,
1479 PyUnicode_1BYTE_DATA(unicode));
1480 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1481 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1482 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1483 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001484 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001486 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 }
1488 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001489 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001490 _PyUnicode_UTF8(unicode) = NULL;
1491 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 }
1493 PyObject_FREE(_PyUnicode_WSTR(unicode));
1494 _PyUnicode_WSTR(unicode) = NULL;
1495 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1496 }
1497 /* In this case we might have to convert down from 4-byte native
1498 wchar_t to 2-byte unicode. */
1499 else if (maxchar < 65536) {
1500 assert(num_surrogates == 0 &&
1501 "FindMaxCharAndNumSurrogatePairs() messed up");
1502
Victor Stinner506f5922011-09-28 22:34:18 +02001503#if SIZEOF_WCHAR_T == 2
1504 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001505 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001506 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1507 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1508 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001509 _PyUnicode_UTF8(unicode) = NULL;
1510 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001511#else
1512 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001514 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001515 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001516 PyErr_NoMemory();
1517 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 }
Victor Stinner506f5922011-09-28 22:34:18 +02001519 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1520 _PyUnicode_WSTR(unicode), end,
1521 PyUnicode_2BYTE_DATA(unicode));
1522 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1523 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1524 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001525 _PyUnicode_UTF8(unicode) = NULL;
1526 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001527 PyObject_FREE(_PyUnicode_WSTR(unicode));
1528 _PyUnicode_WSTR(unicode) = NULL;
1529 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1530#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 }
1532 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1533 else {
1534#if SIZEOF_WCHAR_T == 2
1535 /* in case the native representation is 2-bytes, we need to allocate a
1536 new normalized 4-byte version. */
1537 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001538 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1539 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 PyErr_NoMemory();
1541 return -1;
1542 }
1543 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1544 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001545 _PyUnicode_UTF8(unicode) = NULL;
1546 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001547 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1548 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001549 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 PyObject_FREE(_PyUnicode_WSTR(unicode));
1551 _PyUnicode_WSTR(unicode) = NULL;
1552 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1553#else
1554 assert(num_surrogates == 0);
1555
Victor Stinnerc3c74152011-10-02 20:39:55 +02001556 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001558 _PyUnicode_UTF8(unicode) = NULL;
1559 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1561#endif
1562 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1563 }
1564 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001565 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 return 0;
1567}
1568
Alexander Belopolsky40018472011-02-26 01:02:56 +00001569static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001570unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571{
Walter Dörwald16807132007-05-25 13:52:07 +00001572 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 case SSTATE_NOT_INTERNED:
1574 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001575
Benjamin Peterson29060642009-01-31 22:14:21 +00001576 case SSTATE_INTERNED_MORTAL:
1577 /* revive dead object temporarily for DelItem */
1578 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001579 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 Py_FatalError(
1581 "deletion of interned string failed");
1582 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001583
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 case SSTATE_INTERNED_IMMORTAL:
1585 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001586
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 default:
1588 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001589 }
1590
Victor Stinner03490912011-10-03 23:45:12 +02001591 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001593 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001594 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001595 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1596 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001598 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599}
1600
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character string from the
   unicode_latin1 table), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       an entry of the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1617
Alexander Belopolsky40018472011-02-26 01:02:56 +00001618static int
Victor Stinner488fa492011-12-12 00:01:39 +01001619unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001620{
Victor Stinner488fa492011-12-12 00:01:39 +01001621 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001622 if (Py_REFCNT(unicode) != 1)
1623 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001624 if (_PyUnicode_HASH(unicode) != -1)
1625 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001626 if (PyUnicode_CHECK_INTERNED(unicode))
1627 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001628 if (!PyUnicode_CheckExact(unicode))
1629 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001630#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001631 /* singleton refcount is greater than 1 */
1632 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001633#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001634 return 1;
1635}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636
Victor Stinnerfe226c02011-10-03 03:52:20 +02001637static int
1638unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1639{
1640 PyObject *unicode;
1641 Py_ssize_t old_length;
1642
1643 assert(p_unicode != NULL);
1644 unicode = *p_unicode;
1645
1646 assert(unicode != NULL);
1647 assert(PyUnicode_Check(unicode));
1648 assert(0 <= length);
1649
Victor Stinner910337b2011-10-03 03:20:16 +02001650 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001651 old_length = PyUnicode_WSTR_LENGTH(unicode);
1652 else
1653 old_length = PyUnicode_GET_LENGTH(unicode);
1654 if (old_length == length)
1655 return 0;
1656
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001657 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001658 _Py_INCREF_UNICODE_EMPTY();
1659 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001660 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 Py_DECREF(*p_unicode);
1662 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001663 return 0;
1664 }
1665
Victor Stinner488fa492011-12-12 00:01:39 +01001666 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001667 PyObject *copy = resize_copy(unicode, length);
1668 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001670 Py_DECREF(*p_unicode);
1671 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001672 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001673 }
1674
Victor Stinnerfe226c02011-10-03 03:52:20 +02001675 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001676 PyObject *new_unicode = resize_compact(unicode, length);
1677 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001679 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001681 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001682 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001683}
1684
Alexander Belopolsky40018472011-02-26 01:02:56 +00001685int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001687{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688 PyObject *unicode;
1689 if (p_unicode == NULL) {
1690 PyErr_BadInternalCall();
1691 return -1;
1692 }
1693 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001694 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001695 {
1696 PyErr_BadInternalCall();
1697 return -1;
1698 }
1699 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001700}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001701
Victor Stinnerc5166102012-02-22 13:55:02 +01001702/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001703
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001704 WARNING: The function doesn't copy the terminating null character and
1705 doesn't check the maximum character (may write a latin1 character in an
1706 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001707static void
1708unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1709 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001710{
1711 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1712 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001713 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001714
1715 switch (kind) {
1716 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001717 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001718#ifdef Py_DEBUG
1719 if (PyUnicode_IS_ASCII(unicode)) {
1720 Py_UCS4 maxchar = ucs1lib_find_max_char(
1721 (const Py_UCS1*)str,
1722 (const Py_UCS1*)str + len);
1723 assert(maxchar < 128);
1724 }
1725#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001726 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001727 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 }
1729 case PyUnicode_2BYTE_KIND: {
1730 Py_UCS2 *start = (Py_UCS2 *)data + index;
1731 Py_UCS2 *ucs2 = start;
1732 assert(index <= PyUnicode_GET_LENGTH(unicode));
1733
Victor Stinner184252a2012-06-16 02:57:41 +02001734 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 *ucs2 = (Py_UCS2)*str;
1736
1737 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001738 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001739 }
1740 default: {
1741 Py_UCS4 *start = (Py_UCS4 *)data + index;
1742 Py_UCS4 *ucs4 = start;
1743 assert(kind == PyUnicode_4BYTE_KIND);
1744 assert(index <= PyUnicode_GET_LENGTH(unicode));
1745
Victor Stinner184252a2012-06-16 02:57:41 +02001746 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001747 *ucs4 = (Py_UCS4)*str;
1748
1749 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001750 }
1751 }
1752}
1753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754static PyObject*
1755get_latin1_char(unsigned char ch)
1756{
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001759 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 if (!unicode)
1761 return NULL;
1762 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001763 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 unicode_latin1[ch] = unicode;
1765 }
1766 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001767 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768}
1769
Victor Stinner985a82a2014-01-03 12:53:47 +01001770static PyObject*
1771unicode_char(Py_UCS4 ch)
1772{
1773 PyObject *unicode;
1774
1775 assert(ch <= MAX_UNICODE);
1776
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001777 if (ch < 256)
1778 return get_latin1_char(ch);
1779
Victor Stinner985a82a2014-01-03 12:53:47 +01001780 unicode = PyUnicode_New(1, ch);
1781 if (unicode == NULL)
1782 return NULL;
1783 switch (PyUnicode_KIND(unicode)) {
1784 case PyUnicode_1BYTE_KIND:
1785 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1786 break;
1787 case PyUnicode_2BYTE_KIND:
1788 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1789 break;
1790 default:
1791 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1792 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1793 }
1794 assert(_PyUnicode_CheckConsistency(unicode, 1));
1795 return unicode;
1796}
1797
Alexander Belopolsky40018472011-02-26 01:02:56 +00001798PyObject *
1799PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001801 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 Py_UCS4 maxchar = 0;
1803 Py_ssize_t num_surrogates;
1804
1805 if (u == NULL)
1806 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808 /* If the Unicode data is known at construction time, we can apply
1809 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001812 if (size == 0)
1813 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 /* Single character Unicode objects in the Latin-1 range are
1816 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001817 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return get_latin1_char((unsigned char)*u);
1819
1820 /* If not empty and not single character, copy the Unicode data
1821 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001822 if (find_maxchar_surrogates(u, u + size,
1823 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 return NULL;
1825
Victor Stinner8faf8212011-12-08 22:14:11 +01001826 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 if (!unicode)
1828 return NULL;
1829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 switch (PyUnicode_KIND(unicode)) {
1831 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001832 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1834 break;
1835 case PyUnicode_2BYTE_KIND:
1836#if Py_UNICODE_SIZE == 2
1837 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1838#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001839 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1841#endif
1842 break;
1843 case PyUnicode_4BYTE_KIND:
1844#if SIZEOF_WCHAR_T == 2
1845 /* This is the only case which has to process surrogates, thus
1846 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001847 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848#else
1849 assert(num_surrogates == 0);
1850 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1851#endif
1852 break;
1853 default:
1854 assert(0 && "Impossible state");
1855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001857 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858}
1859
Alexander Belopolsky40018472011-02-26 01:02:56 +00001860PyObject *
1861PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001862{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001863 if (size < 0) {
1864 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001865 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 return NULL;
1867 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001868 if (u != NULL)
1869 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1870 else
1871 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001872}
1873
Alexander Belopolsky40018472011-02-26 01:02:56 +00001874PyObject *
1875PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001876{
1877 size_t size = strlen(u);
1878 if (size > PY_SSIZE_T_MAX) {
1879 PyErr_SetString(PyExc_OverflowError, "input too long");
1880 return NULL;
1881 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001882 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001883}
1884
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001885PyObject *
1886_PyUnicode_FromId(_Py_Identifier *id)
1887{
1888 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001889 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1890 strlen(id->string),
1891 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001892 if (!id->object)
1893 return NULL;
1894 PyUnicode_InternInPlace(&id->object);
1895 assert(!id->next);
1896 id->next = static_strings;
1897 static_strings = id;
1898 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001899 return id->object;
1900}
1901
1902void
1903_PyUnicode_ClearStaticStrings()
1904{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001905 _Py_Identifier *tmp, *s = static_strings;
1906 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001907 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001908 tmp = s->next;
1909 s->next = NULL;
1910 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001911 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001912 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001913}
1914
Benjamin Peterson0df54292012-03-26 14:50:32 -04001915/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916
Victor Stinnerd3f08822012-05-29 12:57:52 +02001917PyObject*
1918_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001919{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001920 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001921 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001922 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001923#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001924 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001925#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001926 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001927 }
Victor Stinner785938e2011-12-11 20:09:03 +01001928 unicode = PyUnicode_New(size, 127);
1929 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001930 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001931 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1932 assert(_PyUnicode_CheckConsistency(unicode, 1));
1933 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001934}
1935
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001936static Py_UCS4
1937kind_maxchar_limit(unsigned int kind)
1938{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001939 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001940 case PyUnicode_1BYTE_KIND:
1941 return 0x80;
1942 case PyUnicode_2BYTE_KIND:
1943 return 0x100;
1944 case PyUnicode_4BYTE_KIND:
1945 return 0x10000;
1946 default:
1947 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001948 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001949 }
1950}
1951
Victor Stinnere6abb482012-05-02 01:15:40 +02001952Py_LOCAL_INLINE(Py_UCS4)
1953align_maxchar(Py_UCS4 maxchar)
1954{
1955 if (maxchar <= 127)
1956 return 127;
1957 else if (maxchar <= 255)
1958 return 255;
1959 else if (maxchar <= 65535)
1960 return 65535;
1961 else
1962 return MAX_UNICODE;
1963}
1964
Victor Stinner702c7342011-10-05 13:50:52 +02001965static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001966_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001969 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001970
Serhiy Storchaka678db842013-01-26 12:16:36 +02001971 if (size == 0)
1972 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001973 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001974 if (size == 1)
1975 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001976
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001977 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001978 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 if (!res)
1980 return NULL;
1981 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001982 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001984}
1985
Victor Stinnere57b1c02011-09-28 22:20:48 +02001986static PyObject*
1987_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988{
1989 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001990 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001991
Serhiy Storchaka678db842013-01-26 12:16:36 +02001992 if (size == 0)
1993 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001994 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 if (size == 1)
1996 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001998 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001999 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 if (!res)
2001 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002004 else {
2005 _PyUnicode_CONVERT_BYTES(
2006 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2007 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002008 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 return res;
2010}
2011
Victor Stinnere57b1c02011-09-28 22:20:48 +02002012static PyObject*
2013_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014{
2015 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002016 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002017
Serhiy Storchaka678db842013-01-26 12:16:36 +02002018 if (size == 0)
2019 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002020 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002021 if (size == 1)
2022 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002024 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002025 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 if (!res)
2027 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002028 if (max_char < 256)
2029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2030 PyUnicode_1BYTE_DATA(res));
2031 else if (max_char < 0x10000)
2032 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2033 PyUnicode_2BYTE_DATA(res));
2034 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002036 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 return res;
2038}
2039
2040PyObject*
2041PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2042{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002043 if (size < 0) {
2044 PyErr_SetString(PyExc_ValueError, "size must be positive");
2045 return NULL;
2046 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002047 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002053 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002054 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002055 PyErr_SetString(PyExc_SystemError, "invalid kind");
2056 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058}
2059
Victor Stinnerece58de2012-04-23 23:36:38 +02002060Py_UCS4
2061_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2062{
2063 enum PyUnicode_Kind kind;
2064 void *startptr, *endptr;
2065
2066 assert(PyUnicode_IS_READY(unicode));
2067 assert(0 <= start);
2068 assert(end <= PyUnicode_GET_LENGTH(unicode));
2069 assert(start <= end);
2070
2071 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2072 return PyUnicode_MAX_CHAR_VALUE(unicode);
2073
2074 if (start == end)
2075 return 127;
2076
Victor Stinner94d558b2012-04-27 22:26:58 +02002077 if (PyUnicode_IS_ASCII(unicode))
2078 return 127;
2079
Victor Stinnerece58de2012-04-23 23:36:38 +02002080 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002081 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002082 endptr = (char *)startptr + end * kind;
2083 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002084 switch(kind) {
2085 case PyUnicode_1BYTE_KIND:
2086 return ucs1lib_find_max_char(startptr, endptr);
2087 case PyUnicode_2BYTE_KIND:
2088 return ucs2lib_find_max_char(startptr, endptr);
2089 case PyUnicode_4BYTE_KIND:
2090 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002091 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002092 assert(0);
2093 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002094 }
2095}
2096
Victor Stinner25a4b292011-10-06 12:31:55 +02002097/* Ensure that a string uses the most efficient storage, if it is not the
2098 case: create a new string with of the right kind. Write NULL into *p_unicode
2099 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002100static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002101unicode_adjust_maxchar(PyObject **p_unicode)
2102{
2103 PyObject *unicode, *copy;
2104 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002105 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002106 unsigned int kind;
2107
2108 assert(p_unicode != NULL);
2109 unicode = *p_unicode;
2110 assert(PyUnicode_IS_READY(unicode));
2111 if (PyUnicode_IS_ASCII(unicode))
2112 return;
2113
2114 len = PyUnicode_GET_LENGTH(unicode);
2115 kind = PyUnicode_KIND(unicode);
2116 if (kind == PyUnicode_1BYTE_KIND) {
2117 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002118 max_char = ucs1lib_find_max_char(u, u + len);
2119 if (max_char >= 128)
2120 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002121 }
2122 else if (kind == PyUnicode_2BYTE_KIND) {
2123 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002124 max_char = ucs2lib_find_max_char(u, u + len);
2125 if (max_char >= 256)
2126 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002127 }
2128 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 max_char = ucs4lib_find_max_char(u, u + len);
2132 if (max_char >= 0x10000)
2133 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002135 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002136 if (copy != NULL)
2137 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002138 Py_DECREF(unicode);
2139 *p_unicode = copy;
2140}
2141
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002143_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002144{
Victor Stinner87af4f22011-11-21 23:03:47 +01002145 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002147
Victor Stinner034f6cf2011-09-30 02:26:44 +02002148 if (!PyUnicode_Check(unicode)) {
2149 PyErr_BadInternalCall();
2150 return NULL;
2151 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002152 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002154
Victor Stinner87af4f22011-11-21 23:03:47 +01002155 length = PyUnicode_GET_LENGTH(unicode);
2156 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002157 if (!copy)
2158 return NULL;
2159 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2160
Victor Stinner87af4f22011-11-21 23:03:47 +01002161 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2162 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002163 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002164 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002165}
2166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167
Victor Stinnerbc603d12011-10-02 01:00:40 +02002168/* Widen Unicode objects to larger buffers. Don't write terminating null
2169 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170
2171void*
2172_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2173{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 Py_ssize_t len;
2175 void *result;
2176 unsigned int skind;
2177
Benjamin Petersonbac79492012-01-14 13:34:47 -05002178 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002179 return NULL;
2180
2181 len = PyUnicode_GET_LENGTH(s);
2182 skind = PyUnicode_KIND(s);
2183 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002184 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return NULL;
2186 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002187 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002188 case PyUnicode_2BYTE_KIND:
2189 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2190 if (!result)
2191 return PyErr_NoMemory();
2192 assert(skind == PyUnicode_1BYTE_KIND);
2193 _PyUnicode_CONVERT_BYTES(
2194 Py_UCS1, Py_UCS2,
2195 PyUnicode_1BYTE_DATA(s),
2196 PyUnicode_1BYTE_DATA(s) + len,
2197 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002199 case PyUnicode_4BYTE_KIND:
2200 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2201 if (!result)
2202 return PyErr_NoMemory();
2203 if (skind == PyUnicode_2BYTE_KIND) {
2204 _PyUnicode_CONVERT_BYTES(
2205 Py_UCS2, Py_UCS4,
2206 PyUnicode_2BYTE_DATA(s),
2207 PyUnicode_2BYTE_DATA(s) + len,
2208 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002210 else {
2211 assert(skind == PyUnicode_1BYTE_KIND);
2212 _PyUnicode_CONVERT_BYTES(
2213 Py_UCS1, Py_UCS4,
2214 PyUnicode_1BYTE_DATA(s),
2215 PyUnicode_1BYTE_DATA(s) + len,
2216 result);
2217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002219 default:
2220 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 }
Victor Stinner01698042011-10-04 00:04:26 +02002222 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 return NULL;
2224}
2225
2226static Py_UCS4*
2227as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228 int copy_null)
2229{
2230 int kind;
2231 void *data;
2232 Py_ssize_t len, targetlen;
2233 if (PyUnicode_READY(string) == -1)
2234 return NULL;
2235 kind = PyUnicode_KIND(string);
2236 data = PyUnicode_DATA(string);
2237 len = PyUnicode_GET_LENGTH(string);
2238 targetlen = len;
2239 if (copy_null)
2240 targetlen++;
2241 if (!target) {
2242 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2247 if (!target) {
2248 PyErr_NoMemory();
2249 return NULL;
2250 }
2251 }
2252 else {
2253 if (targetsize < targetlen) {
2254 PyErr_Format(PyExc_SystemError,
2255 "string is longer than the buffer");
2256 if (copy_null && 0 < targetsize)
2257 target[0] = 0;
2258 return NULL;
2259 }
2260 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002261 if (kind == PyUnicode_1BYTE_KIND) {
2262 Py_UCS1 *start = (Py_UCS1 *) data;
2263 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 else if (kind == PyUnicode_2BYTE_KIND) {
2266 Py_UCS2 *start = (Py_UCS2 *) data;
2267 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2268 }
2269 else {
2270 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 if (copy_null)
2274 target[len] = 0;
2275 return target;
2276}
2277
2278Py_UCS4*
2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2280 int copy_null)
2281{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002282 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 PyErr_BadInternalCall();
2284 return NULL;
2285 }
2286 return as_ucs4(string, target, targetsize, copy_null);
2287}
2288
2289Py_UCS4*
2290PyUnicode_AsUCS4Copy(PyObject *string)
2291{
2292 return as_ucs4(string, NULL, 0, 1);
2293}
2294
2295#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002296
Alexander Belopolsky40018472011-02-26 01:02:56 +00002297PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002302 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 PyErr_BadInternalCall();
2304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 }
2306
Martin v. Löwis790465f2008-04-05 20:41:37 +00002307 if (size == -1) {
2308 size = wcslen(w);
2309 }
2310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312}
2313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002315
Walter Dörwald346737f2007-05-31 10:44:43 +00002316static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002317makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002318 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002319{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002321 if (longflag)
2322 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002323 else if (longlongflag) {
2324 /* longlongflag should only ever be nonzero on machines with
2325 HAVE_LONG_LONG defined */
2326#ifdef HAVE_LONG_LONG
2327 char *f = PY_FORMAT_LONG_LONG;
2328 while (*f)
2329 *fmt++ = *f++;
2330#else
2331 /* we shouldn't ever get here */
2332 assert(0);
2333 *fmt++ = 'l';
2334#endif
2335 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002336 else if (size_tflag) {
2337 char *f = PY_FORMAT_SIZE_T;
2338 while (*f)
2339 *fmt++ = *f++;
2340 }
2341 *fmt++ = c;
2342 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002343}
2344
/* Size of the buffers that receive the output of %lld or %p.
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits, plus 1 character for the sign.  53/22 is used as an upper
   bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002349
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002350static int
2351unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2352 Py_ssize_t width, Py_ssize_t precision)
2353{
2354 Py_ssize_t length, fill, arglen;
2355 Py_UCS4 maxchar;
2356
2357 if (PyUnicode_READY(str) == -1)
2358 return -1;
2359
2360 length = PyUnicode_GET_LENGTH(str);
2361 if ((precision == -1 || precision >= length)
2362 && width <= length)
2363 return _PyUnicodeWriter_WriteStr(writer, str);
2364
2365 if (precision != -1)
2366 length = Py_MIN(precision, length);
2367
2368 arglen = Py_MAX(length, width);
2369 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2370 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2371 else
2372 maxchar = writer->maxchar;
2373
2374 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2375 return -1;
2376
2377 if (width > length) {
2378 fill = width - length;
2379 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2380 return -1;
2381 writer->pos += fill;
2382 }
2383
2384 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2385 str, 0, length);
2386 writer->pos += length;
2387 return 0;
2388}
2389
2390static int
2391unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2392 Py_ssize_t width, Py_ssize_t precision)
2393{
2394 /* UTF-8 */
2395 Py_ssize_t length;
2396 PyObject *unicode;
2397 int res;
2398
2399 length = strlen(str);
2400 if (precision != -1)
2401 length = Py_MIN(length, precision);
2402 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2403 if (unicode == NULL)
2404 return -1;
2405
2406 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2407 Py_DECREF(unicode);
2408 return res;
2409}
2410
Victor Stinner96865452011-03-01 23:44:09 +00002411static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002412unicode_fromformat_arg(_PyUnicodeWriter *writer,
2413 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002414{
Victor Stinnere215d962012-10-06 23:03:36 +02002415 const char *p;
2416 Py_ssize_t len;
2417 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 Py_ssize_t width;
2419 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002420 int longflag;
2421 int longlongflag;
2422 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002423 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002424
2425 p = f;
2426 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002427 zeropad = 0;
2428 if (*f == '0') {
2429 zeropad = 1;
2430 f++;
2431 }
Victor Stinner96865452011-03-01 23:44:09 +00002432
2433 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002434 width = -1;
2435 if (Py_ISDIGIT((unsigned)*f)) {
2436 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002437 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002438 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002440 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002441 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002442 return NULL;
2443 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002444 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002445 f++;
2446 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002447 }
2448 precision = -1;
2449 if (*f == '.') {
2450 f++;
2451 if (Py_ISDIGIT((unsigned)*f)) {
2452 precision = (*f - '0');
2453 f++;
2454 while (Py_ISDIGIT((unsigned)*f)) {
2455 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2456 PyErr_SetString(PyExc_ValueError,
2457 "precision too big");
2458 return NULL;
2459 }
2460 precision = (precision * 10) + (*f - '0');
2461 f++;
2462 }
2463 }
Victor Stinner96865452011-03-01 23:44:09 +00002464 if (*f == '%') {
2465 /* "%.3%s" => f points to "3" */
2466 f--;
2467 }
2468 }
2469 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002470 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002471 f--;
2472 }
Victor Stinner96865452011-03-01 23:44:09 +00002473
2474 /* Handle %ld, %lu, %lld and %llu. */
2475 longflag = 0;
2476 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002477 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002478 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002479 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002480 longflag = 1;
2481 ++f;
2482 }
2483#ifdef HAVE_LONG_LONG
2484 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002485 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002486 longlongflag = 1;
2487 f += 2;
2488 }
2489#endif
2490 }
2491 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002492 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002493 size_tflag = 1;
2494 ++f;
2495 }
Victor Stinnere215d962012-10-06 23:03:36 +02002496
2497 if (f[1] == '\0')
2498 writer->overallocate = 0;
2499
2500 switch (*f) {
2501 case 'c':
2502 {
2503 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002504 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002505 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002506 "character argument not in range(0x110000)");
2507 return NULL;
2508 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002509 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002510 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002511 break;
2512 }
2513
2514 case 'i':
2515 case 'd':
2516 case 'u':
2517 case 'x':
2518 {
2519 /* used by sprintf */
2520 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002521 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002522 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002523
2524 if (*f == 'u') {
2525 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2526
2527 if (longflag)
2528 len = sprintf(buffer, fmt,
2529 va_arg(*vargs, unsigned long));
2530#ifdef HAVE_LONG_LONG
2531 else if (longlongflag)
2532 len = sprintf(buffer, fmt,
2533 va_arg(*vargs, unsigned PY_LONG_LONG));
2534#endif
2535 else if (size_tflag)
2536 len = sprintf(buffer, fmt,
2537 va_arg(*vargs, size_t));
2538 else
2539 len = sprintf(buffer, fmt,
2540 va_arg(*vargs, unsigned int));
2541 }
2542 else if (*f == 'x') {
2543 makefmt(fmt, 0, 0, 0, 'x');
2544 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2545 }
2546 else {
2547 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2548
2549 if (longflag)
2550 len = sprintf(buffer, fmt,
2551 va_arg(*vargs, long));
2552#ifdef HAVE_LONG_LONG
2553 else if (longlongflag)
2554 len = sprintf(buffer, fmt,
2555 va_arg(*vargs, PY_LONG_LONG));
2556#endif
2557 else if (size_tflag)
2558 len = sprintf(buffer, fmt,
2559 va_arg(*vargs, Py_ssize_t));
2560 else
2561 len = sprintf(buffer, fmt,
2562 va_arg(*vargs, int));
2563 }
2564 assert(len >= 0);
2565
Victor Stinnere215d962012-10-06 23:03:36 +02002566 if (precision < len)
2567 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002568
2569 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002570 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2571 return NULL;
2572
Victor Stinnere215d962012-10-06 23:03:36 +02002573 if (width > precision) {
2574 Py_UCS4 fillchar;
2575 fill = width - precision;
2576 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002577 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2578 return NULL;
2579 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002580 }
Victor Stinner15a11362012-10-06 23:48:20 +02002581 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002582 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002583 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2584 return NULL;
2585 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002586 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002587
Victor Stinner4a587072013-11-19 12:54:53 +01002588 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2589 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002590 break;
2591 }
2592
2593 case 'p':
2594 {
2595 char number[MAX_LONG_LONG_CHARS];
2596
2597 len = sprintf(number, "%p", va_arg(*vargs, void*));
2598 assert(len >= 0);
2599
2600 /* %p is ill-defined: ensure leading 0x. */
2601 if (number[1] == 'X')
2602 number[1] = 'x';
2603 else if (number[1] != 'x') {
2604 memmove(number + 2, number,
2605 strlen(number) + 1);
2606 number[0] = '0';
2607 number[1] = 'x';
2608 len += 2;
2609 }
2610
Victor Stinner4a587072013-11-19 12:54:53 +01002611 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002612 return NULL;
2613 break;
2614 }
2615
2616 case 's':
2617 {
2618 /* UTF-8 */
2619 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002620 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002621 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002622 break;
2623 }
2624
2625 case 'U':
2626 {
2627 PyObject *obj = va_arg(*vargs, PyObject *);
2628 assert(obj && _PyUnicode_CHECK(obj));
2629
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002631 return NULL;
2632 break;
2633 }
2634
2635 case 'V':
2636 {
2637 PyObject *obj = va_arg(*vargs, PyObject *);
2638 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002639 if (obj) {
2640 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002641 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002642 return NULL;
2643 }
2644 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002645 assert(str != NULL);
2646 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002647 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002648 }
2649 break;
2650 }
2651
2652 case 'S':
2653 {
2654 PyObject *obj = va_arg(*vargs, PyObject *);
2655 PyObject *str;
2656 assert(obj);
2657 str = PyObject_Str(obj);
2658 if (!str)
2659 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002660 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002661 Py_DECREF(str);
2662 return NULL;
2663 }
2664 Py_DECREF(str);
2665 break;
2666 }
2667
2668 case 'R':
2669 {
2670 PyObject *obj = va_arg(*vargs, PyObject *);
2671 PyObject *repr;
2672 assert(obj);
2673 repr = PyObject_Repr(obj);
2674 if (!repr)
2675 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002676 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002677 Py_DECREF(repr);
2678 return NULL;
2679 }
2680 Py_DECREF(repr);
2681 break;
2682 }
2683
2684 case 'A':
2685 {
2686 PyObject *obj = va_arg(*vargs, PyObject *);
2687 PyObject *ascii;
2688 assert(obj);
2689 ascii = PyObject_ASCII(obj);
2690 if (!ascii)
2691 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002692 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002693 Py_DECREF(ascii);
2694 return NULL;
2695 }
2696 Py_DECREF(ascii);
2697 break;
2698 }
2699
2700 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002701 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002702 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002703 break;
2704
2705 default:
2706 /* if we stumble upon an unknown formatting code, copy the rest
2707 of the format string to the output string. (we cannot just
2708 skip the code, since there's no way to know what's in the
2709 argument list) */
2710 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002711 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002712 return NULL;
2713 f = p+len;
2714 return f;
2715 }
2716
2717 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002718 return f;
2719}
2720
Walter Dörwaldd2034312007-05-18 16:29:38 +00002721PyObject *
2722PyUnicode_FromFormatV(const char *format, va_list vargs)
2723{
Victor Stinnere215d962012-10-06 23:03:36 +02002724 va_list vargs2;
2725 const char *f;
2726 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002727
Victor Stinner8f674cc2013-04-17 23:02:17 +02002728 _PyUnicodeWriter_Init(&writer);
2729 writer.min_length = strlen(format) + 100;
2730 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002731
2732 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2733 Copy it to be able to pass a reference to a subfunction. */
2734 Py_VA_COPY(vargs2, vargs);
2735
2736 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002738 f = unicode_fromformat_arg(&writer, f, &vargs2);
2739 if (f == NULL)
2740 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002743 const char *p;
2744 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002745
Victor Stinnere215d962012-10-06 23:03:36 +02002746 p = f;
2747 do
2748 {
2749 if ((unsigned char)*p > 127) {
2750 PyErr_Format(PyExc_ValueError,
2751 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2752 "string, got a non-ASCII byte: 0x%02x",
2753 (unsigned char)*p);
2754 return NULL;
2755 }
2756 p++;
2757 }
2758 while (*p != '\0' && *p != '%');
2759 len = p - f;
2760
2761 if (*p == '\0')
2762 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002763
2764 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002765 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002766
2767 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 }
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return _PyUnicodeWriter_Finish(&writer);
2771
2772 fail:
2773 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775}
2776
Walter Dörwaldd2034312007-05-18 16:29:38 +00002777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002780 PyObject* ret;
2781 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782
2783#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 ret = PyUnicode_FromFormatV(format, vargs);
2789 va_end(vargs);
2790 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002791}
2792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793#ifdef HAVE_WCHAR_H
2794
Victor Stinner5593d8a2010-10-02 11:11:27 +00002795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796 convert a Unicode object to a wide character string.
2797
Victor Stinnerd88d9832011-09-06 02:00:05 +02002798 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002799 character) required to convert the unicode object. Ignore size argument.
2800
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002803 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002804static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002806 wchar_t *w,
2807 Py_ssize_t size)
2808{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 const wchar_t *wstr;
2811
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (wstr == NULL)
2814 return -1;
2815
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002817 if (size > res)
2818 size = res + 1;
2819 else
2820 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002822 return res;
2823 }
2824 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002826}
2827
2828Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002829PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 wchar_t *w,
2831 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832{
2833 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 PyErr_BadInternalCall();
2835 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002837 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838}
2839
Victor Stinner137c34c2010-09-29 10:25:54 +00002840wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002841PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002842 Py_ssize_t *size)
2843{
2844 wchar_t* buffer;
2845 Py_ssize_t buflen;
2846
2847 if (unicode == NULL) {
2848 PyErr_BadInternalCall();
2849 return NULL;
2850 }
2851
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002852 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002853 if (buflen == -1)
2854 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002855 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002856 PyErr_NoMemory();
2857 return NULL;
2858 }
2859
Victor Stinner137c34c2010-09-29 10:25:54 +00002860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2861 if (buffer == NULL) {
2862 PyErr_NoMemory();
2863 return NULL;
2864 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002865 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002866 if (buflen == -1) {
2867 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002869 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 if (size != NULL)
2871 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002872 return buffer;
2873}
2874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002875#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Alexander Belopolsky40018472011-02-26 01:02:56 +00002877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002879{
Victor Stinner8faf8212011-12-08 22:14:11 +01002880 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 PyErr_SetString(PyExc_ValueError,
2882 "chr() arg not in range(0x110000)");
2883 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002884 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002885
Victor Stinner985a82a2014-01-03 12:53:47 +01002886 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002887}
2888
Alexander Belopolsky40018472011-02-26 01:02:56 +00002889PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002890PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002894 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002895 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002896 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 Py_INCREF(obj);
2898 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002899 }
2900 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 /* For a Unicode subtype that's not a Unicode object,
2902 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002903 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002904 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002905 PyErr_Format(PyExc_TypeError,
2906 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002907 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002908 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002909}
2910
Alexander Belopolsky40018472011-02-26 01:02:56 +00002911PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002912PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002913 const char *encoding,
2914 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002915{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002916 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002917 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002918
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 PyErr_BadInternalCall();
2921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002923
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002924 /* Decoding bytes objects is the most common case and should be fast */
2925 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002926 if (PyBytes_GET_SIZE(obj) == 0)
2927 _Py_RETURN_UNICODE_EMPTY();
2928 v = PyUnicode_Decode(
2929 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2930 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002931 return v;
2932 }
2933
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002934 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002935 PyErr_SetString(PyExc_TypeError,
2936 "decoding str is not supported");
2937 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002939
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002940 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2941 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2942 PyErr_Format(PyExc_TypeError,
2943 "coercing to str: need bytes, bytearray "
2944 "or buffer-like object, %.80s found",
2945 Py_TYPE(obj)->tp_name);
2946 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002947 }
Tim Petersced69f82003-09-16 20:30:58 +00002948
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002949 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002950 PyBuffer_Release(&buffer);
2951 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002953
Serhiy Storchaka05997252013-01-26 12:14:02 +02002954 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002955 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002956 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957}
2958
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The normalized, null-terminated name is stored in
   lower, a buffer of lower_len bytes.  A NULL encoding is normalized to
   "utf-8".

   Return 1 on success, 0 on error (the result, terminator included, does
   not fit in lower_len bytes; this covers lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator.  Bail out before
       lower_len - 1 wraps around and l_end ends up outside the buffer, which
       would let the loop below write out of bounds. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2998
Alexander Belopolsky40018472011-02-26 01:02:56 +00002999PyObject *
3000PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003001 Py_ssize_t size,
3002 const char *encoding,
3003 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003004{
3005 PyObject *buffer = NULL, *unicode;
3006 Py_buffer info;
3007 char lower[11]; /* Enough for any encoding shortcut */
3008
Fred Drakee4315f52000-05-09 19:53:39 +00003009 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003010 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003011 if ((strcmp(lower, "utf-8") == 0) ||
3012 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003013 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003014 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003015 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003016 (strcmp(lower, "iso-8859-1") == 0) ||
3017 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003018 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003019#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003020 else if (strcmp(lower, "mbcs") == 0)
3021 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003022#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003023 else if (strcmp(lower, "ascii") == 0)
3024 return PyUnicode_DecodeASCII(s, size, errors);
3025 else if (strcmp(lower, "utf-16") == 0)
3026 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3027 else if (strcmp(lower, "utf-32") == 0)
3028 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030
3031 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003032 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003033 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003034 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003035 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 if (buffer == NULL)
3037 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003038 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 if (unicode == NULL)
3040 goto onError;
3041 if (!PyUnicode_Check(unicode)) {
3042 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003043 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3044 "use codecs.decode() to decode to arbitrary types",
3045 encoding,
3046 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 Py_DECREF(unicode);
3048 goto onError;
3049 }
3050 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003051 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003052
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 Py_XDECREF(buffer);
3055 return NULL;
3056}
3057
Alexander Belopolsky40018472011-02-26 01:02:56 +00003058PyObject *
3059PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003060 const char *encoding,
3061 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003062{
3063 PyObject *v;
3064
3065 if (!PyUnicode_Check(unicode)) {
3066 PyErr_BadArgument();
3067 goto onError;
3068 }
3069
3070 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003072
3073 /* Decode via the codec registry */
3074 v = PyCodec_Decode(unicode, encoding, errors);
3075 if (v == NULL)
3076 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003077 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003078
Benjamin Peterson29060642009-01-31 22:14:21 +00003079 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003080 return NULL;
3081}
3082
Alexander Belopolsky40018472011-02-26 01:02:56 +00003083PyObject *
3084PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003085 const char *encoding,
3086 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003087{
3088 PyObject *v;
3089
3090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
3092 goto onError;
3093 }
3094
3095 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003096 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003097
3098 /* Decode via the codec registry */
3099 v = PyCodec_Decode(unicode, encoding, errors);
3100 if (v == NULL)
3101 goto onError;
3102 if (!PyUnicode_Check(v)) {
3103 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003104 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3105 "use codecs.decode() to decode to arbitrary types",
3106 encoding,
3107 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003108 Py_DECREF(v);
3109 goto onError;
3110 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003111 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003112
Benjamin Peterson29060642009-01-31 22:14:21 +00003113 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003114 return NULL;
3115}
3116
Alexander Belopolsky40018472011-02-26 01:02:56 +00003117PyObject *
3118PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003119 Py_ssize_t size,
3120 const char *encoding,
3121 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122{
3123 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003124
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125 unicode = PyUnicode_FromUnicode(s, size);
3126 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3129 Py_DECREF(unicode);
3130 return v;
3131}
3132
Alexander Belopolsky40018472011-02-26 01:02:56 +00003133PyObject *
3134PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003135 const char *encoding,
3136 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003137{
3138 PyObject *v;
3139
3140 if (!PyUnicode_Check(unicode)) {
3141 PyErr_BadArgument();
3142 goto onError;
3143 }
3144
3145 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003147
3148 /* Encode via the codec registry */
3149 v = PyCodec_Encode(unicode, encoding, errors);
3150 if (v == NULL)
3151 goto onError;
3152 return v;
3153
Benjamin Peterson29060642009-01-31 22:14:21 +00003154 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003155 return NULL;
3156}
3157
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Returns the index of the offending character, or 0 if every character
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + null */
#else
    wchar_t unit[2];            /* one character + null */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator. */
    unit[sizeof(unit) / sizeof(unit[0]) - 1] = 0;

    while (*cursor != L'\0') {
        const wchar_t *current = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3204
Victor Stinner1b579672011-12-17 05:47:23 +01003205static int
3206locale_error_handler(const char *errors, int *surrogateescape)
3207{
3208 if (errors == NULL) {
3209 *surrogateescape = 0;
3210 return 0;
3211 }
3212
3213 if (strcmp(errors, "strict") == 0) {
3214 *surrogateescape = 0;
3215 return 0;
3216 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003217 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003218 *surrogateescape = 1;
3219 return 0;
3220 }
3221 PyErr_Format(PyExc_ValueError,
3222 "only 'strict' and 'surrogateescape' error handlers "
3223 "are supported, not '%s'",
3224 errors);
3225 return -1;
3226}
3227
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003228PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003229PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003230{
3231 Py_ssize_t wlen, wlen2;
3232 wchar_t *wstr;
3233 PyObject *bytes = NULL;
3234 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003235 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003236 PyObject *exc;
3237 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003238 int surrogateescape;
3239
3240 if (locale_error_handler(errors, &surrogateescape) < 0)
3241 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003242
3243 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3244 if (wstr == NULL)
3245 return NULL;
3246
3247 wlen2 = wcslen(wstr);
3248 if (wlen2 != wlen) {
3249 PyMem_Free(wstr);
3250 PyErr_SetString(PyExc_TypeError, "embedded null character");
3251 return NULL;
3252 }
3253
3254 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003255 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003256 char *str;
3257
3258 str = _Py_wchar2char(wstr, &error_pos);
3259 if (str == NULL) {
3260 if (error_pos == (size_t)-1) {
3261 PyErr_NoMemory();
3262 PyMem_Free(wstr);
3263 return NULL;
3264 }
3265 else {
3266 goto encode_error;
3267 }
3268 }
3269 PyMem_Free(wstr);
3270
3271 bytes = PyBytes_FromString(str);
3272 PyMem_Free(str);
3273 }
3274 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003275 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003276 size_t len, len2;
3277
3278 len = wcstombs(NULL, wstr, 0);
3279 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003280 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003281 goto encode_error;
3282 }
3283
3284 bytes = PyBytes_FromStringAndSize(NULL, len);
3285 if (bytes == NULL) {
3286 PyMem_Free(wstr);
3287 return NULL;
3288 }
3289
3290 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3291 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003292 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003293 goto encode_error;
3294 }
3295 PyMem_Free(wstr);
3296 }
3297 return bytes;
3298
3299encode_error:
3300 errmsg = strerror(errno);
3301 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003302
3303 if (error_pos == (size_t)-1)
3304 error_pos = wcstombs_errorpos(wstr);
3305
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003306 PyMem_Free(wstr);
3307 Py_XDECREF(bytes);
3308
Victor Stinner2f197072011-12-17 07:08:30 +01003309 if (errmsg != NULL) {
3310 size_t errlen;
3311 wstr = _Py_char2wchar(errmsg, &errlen);
3312 if (wstr != NULL) {
3313 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003314 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003315 } else
3316 errmsg = NULL;
3317 }
3318 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003319 reason = PyUnicode_FromString(
3320 "wcstombs() encountered an unencodable "
3321 "wide character");
3322 if (reason == NULL)
3323 return NULL;
3324
3325 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3326 "locale", unicode,
3327 (Py_ssize_t)error_pos,
3328 (Py_ssize_t)(error_pos+1),
3329 reason);
3330 Py_DECREF(reason);
3331 if (exc != NULL) {
3332 PyCodec_StrictErrors(exc);
3333 Py_XDECREF(exc);
3334 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003335 return NULL;
3336}
3337
Victor Stinnerad158722010-10-27 00:25:46 +00003338PyObject *
3339PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003340{
Victor Stinner99b95382011-07-04 14:23:54 +02003341#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003342 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003343#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003344 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003345#else
Victor Stinner793b5312011-04-27 00:24:21 +02003346 PyInterpreterState *interp = PyThreadState_GET()->interp;
3347 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3348 cannot use it to encode and decode filenames before it is loaded. Load
3349 the Python codec requires to encode at least its own filename. Use the C
3350 version of the locale codec until the codec registry is initialized and
3351 the Python codec is loaded.
3352
3353 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3354 cannot only rely on it: check also interp->fscodec_initialized for
3355 subinterpreters. */
3356 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003357 return PyUnicode_AsEncodedString(unicode,
3358 Py_FileSystemDefaultEncoding,
3359 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003360 }
3361 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003362 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003363 }
Victor Stinnerad158722010-10-27 00:25:46 +00003364#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003365}
3366
Alexander Belopolsky40018472011-02-26 01:02:56 +00003367PyObject *
3368PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003369 const char *encoding,
3370 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371{
3372 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003373 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003374
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375 if (!PyUnicode_Check(unicode)) {
3376 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 }
Fred Drakee4315f52000-05-09 19:53:39 +00003379
Fred Drakee4315f52000-05-09 19:53:39 +00003380 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003381 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003382 if ((strcmp(lower, "utf-8") == 0) ||
3383 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003384 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003385 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003387 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003388 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003389 }
Victor Stinner37296e82010-06-10 13:36:23 +00003390 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003391 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003392 (strcmp(lower, "iso-8859-1") == 0) ||
3393 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003394 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003395#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003396 else if (strcmp(lower, "mbcs") == 0)
3397 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003398#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003399 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402
3403 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003404 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003406 return NULL;
3407
3408 /* The normal path */
3409 if (PyBytes_Check(v))
3410 return v;
3411
3412 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003413 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003414 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003415 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003416
3417 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003418 "encoder %s returned bytearray instead of bytes; "
3419 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003420 encoding);
3421 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003422 Py_DECREF(v);
3423 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003424 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003425
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003426 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3427 Py_DECREF(v);
3428 return b;
3429 }
3430
3431 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003432 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3433 "use codecs.encode() to encode to arbitrary types",
3434 encoding,
3435 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003436 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003437 return NULL;
3438}
3439
Alexander Belopolsky40018472011-02-26 01:02:56 +00003440PyObject *
3441PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003442 const char *encoding,
3443 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003444{
3445 PyObject *v;
3446
3447 if (!PyUnicode_Check(unicode)) {
3448 PyErr_BadArgument();
3449 goto onError;
3450 }
3451
3452 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003453 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003454
3455 /* Encode via the codec registry */
3456 v = PyCodec_Encode(unicode, encoding, errors);
3457 if (v == NULL)
3458 goto onError;
3459 if (!PyUnicode_Check(v)) {
3460 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003461 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3462 "use codecs.encode() to encode to arbitrary types",
3463 encoding,
3464 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003465 Py_DECREF(v);
3466 goto onError;
3467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003469
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 return NULL;
3472}
3473
/* Locate the first byte sequence in str (len bytes) that mbrtowc() cannot
   convert.  Returns its byte offset, or 0 when no such sequence is found
   or when mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor;
    mbstate_t state;
    size_t consumed;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (cursor = str; len != 0; cursor += consumed, len -= consumed) {
        consumed = mbrtowc(&wc, (char*)cursor, len, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3504
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003505PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003507 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003508{
3509 wchar_t smallbuf[256];
3510 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3511 wchar_t *wstr;
3512 size_t wlen, wlen2;
3513 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003514 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003515 size_t error_pos;
3516 char *errmsg;
3517 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003518
3519 if (locale_error_handler(errors, &surrogateescape) < 0)
3520 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003521
3522 if (str[len] != '\0' || len != strlen(str)) {
3523 PyErr_SetString(PyExc_TypeError, "embedded null character");
3524 return NULL;
3525 }
3526
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003527 if (surrogateescape) {
3528 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003529 wstr = _Py_char2wchar(str, &wlen);
3530 if (wstr == NULL) {
3531 if (wlen == (size_t)-1)
3532 PyErr_NoMemory();
3533 else
3534 PyErr_SetFromErrno(PyExc_OSError);
3535 return NULL;
3536 }
3537
3538 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003539 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003540 }
3541 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003542 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003543#ifndef HAVE_BROKEN_MBSTOWCS
3544 wlen = mbstowcs(NULL, str, 0);
3545#else
3546 wlen = len;
3547#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003548 if (wlen == (size_t)-1)
3549 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003550 if (wlen+1 <= smallbuf_len) {
3551 wstr = smallbuf;
3552 }
3553 else {
3554 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3555 return PyErr_NoMemory();
3556
3557 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3558 if (!wstr)
3559 return PyErr_NoMemory();
3560 }
3561
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003562 wlen2 = mbstowcs(wstr, str, wlen+1);
3563 if (wlen2 == (size_t)-1) {
3564 if (wstr != smallbuf)
3565 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003566 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003567 }
3568#ifdef HAVE_BROKEN_MBSTOWCS
3569 assert(wlen2 == wlen);
3570#endif
3571 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3572 if (wstr != smallbuf)
3573 PyMem_Free(wstr);
3574 }
3575 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003576
3577decode_error:
3578 errmsg = strerror(errno);
3579 assert(errmsg != NULL);
3580
3581 error_pos = mbstowcs_errorpos(str, len);
3582 if (errmsg != NULL) {
3583 size_t errlen;
3584 wstr = _Py_char2wchar(errmsg, &errlen);
3585 if (wstr != NULL) {
3586 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003587 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003588 } else
3589 errmsg = NULL;
3590 }
3591 if (errmsg == NULL)
3592 reason = PyUnicode_FromString(
3593 "mbstowcs() encountered an invalid multibyte sequence");
3594 if (reason == NULL)
3595 return NULL;
3596
3597 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3598 "locale", str, len,
3599 (Py_ssize_t)error_pos,
3600 (Py_ssize_t)(error_pos+1),
3601 reason);
3602 Py_DECREF(reason);
3603 if (exc != NULL) {
3604 PyCodec_StrictErrors(exc);
3605 Py_XDECREF(exc);
3606 }
3607 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003608}
3609
3610PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003611PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003612{
3613 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003614 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003615}
3616
3617
3618PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003619PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003620 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003621 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3622}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003623
Christian Heimes5894ba72007-11-04 11:43:14 +00003624PyObject*
3625PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3626{
Victor Stinner99b95382011-07-04 14:23:54 +02003627#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003628 return PyUnicode_DecodeMBCS(s, size, NULL);
3629#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003630 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003631#else
Victor Stinner793b5312011-04-27 00:24:21 +02003632 PyInterpreterState *interp = PyThreadState_GET()->interp;
3633 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3634 cannot use it to encode and decode filenames before it is loaded. Load
3635 the Python codec requires to encode at least its own filename. Use the C
3636 version of the locale codec until the codec registry is initialized and
3637 the Python codec is loaded.
3638
3639 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3640 cannot only rely on it: check also interp->fscodec_initialized for
3641 subinterpreters. */
3642 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003643 return PyUnicode_Decode(s, size,
3644 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003645 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003646 }
3647 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003648 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003649 }
Victor Stinnerad158722010-10-27 00:25:46 +00003650#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003651}
3652
Martin v. Löwis011e8422009-05-05 04:43:17 +00003653
3654int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003655_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003656{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003657 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003658
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003659 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003660 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003661 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3662 PyUnicode_GET_LENGTH(str), '\0', 1);
3663 if (pos == -1)
3664 return 0;
3665 else
3666 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003667}
3668
Antoine Pitrou13348842012-01-29 18:36:34 +01003669int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003670PyUnicode_FSConverter(PyObject* arg, void* addr)
3671{
3672 PyObject *output = NULL;
3673 Py_ssize_t size;
3674 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003675 if (arg == NULL) {
3676 Py_DECREF(*(PyObject**)addr);
3677 return 1;
3678 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003679 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003680 output = arg;
3681 Py_INCREF(output);
3682 }
3683 else {
3684 arg = PyUnicode_FromObject(arg);
3685 if (!arg)
3686 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003687 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003688 Py_DECREF(arg);
3689 if (!output)
3690 return 0;
3691 if (!PyBytes_Check(output)) {
3692 Py_DECREF(output);
3693 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3694 return 0;
3695 }
3696 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003697 size = PyBytes_GET_SIZE(output);
3698 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003699 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003700 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003701 Py_DECREF(output);
3702 return 0;
3703 }
3704 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003705 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003706}
3707
3708
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003709int
3710PyUnicode_FSDecoder(PyObject* arg, void* addr)
3711{
3712 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003713 if (arg == NULL) {
3714 Py_DECREF(*(PyObject**)addr);
3715 return 1;
3716 }
3717 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003718 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003719 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003720 output = arg;
3721 Py_INCREF(output);
3722 }
3723 else {
3724 arg = PyBytes_FromObject(arg);
3725 if (!arg)
3726 return 0;
3727 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3728 PyBytes_GET_SIZE(arg));
3729 Py_DECREF(arg);
3730 if (!output)
3731 return 0;
3732 if (!PyUnicode_Check(output)) {
3733 Py_DECREF(output);
3734 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3735 return 0;
3736 }
3737 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003738 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003739 Py_DECREF(output);
3740 return 0;
3741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003742 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003743 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003744 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3745 Py_DECREF(output);
3746 return 0;
3747 }
3748 *(PyObject**)addr = output;
3749 return Py_CLEANUP_SUPPORTED;
3750}
3751
3752
Martin v. Löwis5b222132007-06-10 09:51:05 +00003753char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003754PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003755{
Christian Heimesf3863112007-11-22 07:46:41 +00003756 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003757
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003758 if (!PyUnicode_Check(unicode)) {
3759 PyErr_BadArgument();
3760 return NULL;
3761 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003762 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003763 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003765 if (PyUnicode_UTF8(unicode) == NULL) {
3766 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3768 if (bytes == NULL)
3769 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003770 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3771 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003772 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773 Py_DECREF(bytes);
3774 return NULL;
3775 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3777 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3778 PyBytes_AS_STRING(bytes),
3779 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003780 Py_DECREF(bytes);
3781 }
3782
3783 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003784 *psize = PyUnicode_UTF8_LENGTH(unicode);
3785 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003786}
3787
3788char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3792}
3793
/* Return the wchar_t buffer attached to `unicode`, creating it on first
   use.

   If _PyUnicode_WSTR(unicode) is NULL, a buffer is allocated with
   PyObject_MALLOC(), filled from the object's 1-, 2- or 4-byte data,
   NUL-terminated and stored on the object.  When `size` is not NULL it
   receives PyUnicode_WSTR_LENGTH(unicode).

   Returns NULL with an exception set if `unicode` is not a str object or
   if the allocation fails. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF; each of them
               needs two wchar_t units (a surrogate pair). */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* length + surrogates + 1 for the terminating NUL */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting large code points into pairs. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Debug-only sanity check against the length computed in
                   the first pass. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per character plus the
               terminating NUL. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only written for objects that are not
               compact ASCII.  NOTE(review): presumably compact ASCII
               objects have no separate wstr_length field -- confirm
               against the struct definitions. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* widen each byte to a wchar_t */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* widen each 2-byte unit to a wchar_t */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Undo the allocation made above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3905
Alexander Belopolsky40018472011-02-26 01:02:56 +00003906Py_UNICODE *
3907PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910}
3911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912
Alexander Belopolsky40018472011-02-26 01:02:56 +00003913Py_ssize_t
3914PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915{
3916 if (!PyUnicode_Check(unicode)) {
3917 PyErr_BadArgument();
3918 goto onError;
3919 }
3920 return PyUnicode_GET_SIZE(unicode);
3921
Benjamin Peterson29060642009-01-31 22:14:21 +00003922 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 return -1;
3924}
3925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926Py_ssize_t
3927PyUnicode_GetLength(PyObject *unicode)
3928{
Victor Stinner07621332012-06-16 04:53:46 +02003929 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003930 PyErr_BadArgument();
3931 return -1;
3932 }
Victor Stinner07621332012-06-16 04:53:46 +02003933 if (PyUnicode_READY(unicode) == -1)
3934 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935 return PyUnicode_GET_LENGTH(unicode);
3936}
3937
3938Py_UCS4
3939PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3940{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003941 void *data;
3942 int kind;
3943
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003944 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3945 PyErr_BadArgument();
3946 return (Py_UCS4)-1;
3947 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003948 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003949 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 return (Py_UCS4)-1;
3951 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003952 data = PyUnicode_DATA(unicode);
3953 kind = PyUnicode_KIND(unicode);
3954 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955}
3956
3957int
3958PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3959{
3960 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003961 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 return -1;
3963 }
Victor Stinner488fa492011-12-12 00:01:39 +01003964 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003965 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003966 PyErr_SetString(PyExc_IndexError, "string index out of range");
3967 return -1;
3968 }
Victor Stinner488fa492011-12-12 00:01:39 +01003969 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003970 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003971 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3972 PyErr_SetString(PyExc_ValueError, "character out of range");
3973 return -1;
3974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3976 index, ch);
3977 return 0;
3978}
3979
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3985
Victor Stinner554f3f02010-06-16 23:33:54 +00003986/* create or adjust a UnicodeDecodeError */
3987static void
3988make_decode_exception(PyObject **exceptionObject,
3989 const char *encoding,
3990 const char *input, Py_ssize_t length,
3991 Py_ssize_t startpos, Py_ssize_t endpos,
3992 const char *reason)
3993{
3994 if (*exceptionObject == NULL) {
3995 *exceptionObject = PyUnicodeDecodeError_Create(
3996 encoding, input, length, startpos, endpos, reason);
3997 }
3998 else {
3999 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4000 goto onError;
4001 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4002 goto onError;
4003 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4004 goto onError;
4005 }
4006 return;
4007
4008onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004009 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004010}
4011
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   Parameters:
     errors, errorHandler   name of the error handler and a cache for the
                            looked-up callable (filled in on first use)
     encoding, reason       values stored in the UnicodeDecodeError
     input, inend           start/end of the input buffer; rewritten from
                            the exception's object after the callback ran
     startinpos, endinpos   error range; *endinpos is set to the position
                            returned by the handler
     exceptionObject        cache for the exception instance
     inptr                  set to *input + new position
     output, outpos         wchar-kind output object (resized if needed)
                            and the current write position in it
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after "O!n;" doubles as the TypeError message below. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the PyBytes_* macros below must not be applied to a
           non-bytes object. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* HAVE_MBCS */
4118
4119static int
4120unicode_decode_call_errorhandler_writer(
4121 const char *errors, PyObject **errorHandler,
4122 const char *encoding, const char *reason,
4123 const char **input, const char **inend, Py_ssize_t *startinpos,
4124 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4125 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4126{
4127 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4128
4129 PyObject *restuple = NULL;
4130 PyObject *repunicode = NULL;
4131 Py_ssize_t insize;
4132 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004133 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004134 PyObject *inputobj = NULL;
4135
4136 if (*errorHandler == NULL) {
4137 *errorHandler = PyCodec_LookupError(errors);
4138 if (*errorHandler == NULL)
4139 goto onError;
4140 }
4141
4142 make_decode_exception(exceptionObject,
4143 encoding,
4144 *input, *inend - *input,
4145 *startinpos, *endinpos,
4146 reason);
4147 if (*exceptionObject == NULL)
4148 goto onError;
4149
4150 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4151 if (restuple == NULL)
4152 goto onError;
4153 if (!PyTuple_Check(restuple)) {
4154 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4155 goto onError;
4156 }
4157 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004158 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004159
4160 /* Copy back the bytes variables, which might have been modified by the
4161 callback */
4162 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4163 if (!inputobj)
4164 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004165 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004167 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004168 *input = PyBytes_AS_STRING(inputobj);
4169 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004170 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004171 /* we can DECREF safely, as the exception has another reference,
4172 so the object won't go away. */
4173 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004174
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004177 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004178 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004180 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181
Victor Stinner8f674cc2013-04-17 23:02:17 +02004182 if (PyUnicode_READY(repunicode) < 0)
4183 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004184 replen = PyUnicode_GET_LENGTH(repunicode);
4185 writer->min_length += replen;
4186 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004187 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004189 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004192 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004195 Py_XDECREF(restuple);
4196 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004200 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201}
4202
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   Note: IS_BASE64 and FROM_BASE64 expand their argument several times, so
   the argument must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (any character that is not a letter, a digit or '+' maps to 63, so the
   result is only meaningful when IS_BASE64(c) is true) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   (n is masked with 0x3f, so higher bits are ignored) */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127); each row below covers
 * 16 consecutive codes.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The (c) < 128 && (c) > 0 test runs first, so utf7_category is only
 * indexed with values 1..127; NUL is never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004283
Alexander Belopolsky40018472011-02-26 01:02:56 +00004284PyObject *
4285PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004286 Py_ssize_t size,
4287 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004289 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4290}
4291
Antoine Pitrou244651a2009-05-04 18:56:13 +00004292/* The decoder. The only state we preserve is our read position,
4293 * i.e. how many characters we have consumed. So if we end in the
4294 * middle of a shift sequence we have to back off the read position
4295 * and the output to the beginning of the sequence, otherwise we lose
4296 * all the shift state (seen bits, number of bits seen, high
4297 * surrogate). */
4298
Alexander Belopolsky40018472011-02-26 01:02:56 +00004299PyObject *
4300PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004301 Py_ssize_t size,
4302 const char *errors,
4303 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004304{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004306 Py_ssize_t startinpos;
4307 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004308 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004309 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004310 const char *errmsg = "";
4311 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004312 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004313 unsigned int base64bits = 0;
4314 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004315 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 PyObject *errorHandler = NULL;
4317 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004319 if (size == 0) {
4320 if (consumed)
4321 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004322 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004323 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004324
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004325 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004326 _PyUnicodeWriter_Init(&writer);
4327 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328
4329 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330 e = s + size;
4331
4332 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004335 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 if (inShift) { /* in a base-64 section */
4338 if (IS_BASE64(ch)) { /* consume a base-64 character */
4339 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4340 base64bits += 6;
4341 s++;
4342 if (base64bits >= 16) {
4343 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004344 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 base64bits -= 16;
4346 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004347 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 if (surrogate) {
4349 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004350 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4351 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004352 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004353 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004355 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004356 }
4357 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004358 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004359 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 }
4362 }
Victor Stinner551ac952011-11-29 22:58:13 +01004363 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 /* first surrogate */
4365 surrogate = outCh;
4366 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004368 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004369 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 }
4371 }
4372 }
4373 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 inShift = 0;
4375 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004377 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004378 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004379 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 if (base64bits > 0) { /* left-over bits */
4382 if (base64bits >= 6) {
4383 /* We've seen at least one base-64 character */
4384 errmsg = "partial character in shift sequence";
4385 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 else {
4388 /* Some bits remain; they should be zero */
4389 if (base64buffer != 0) {
4390 errmsg = "non-zero padding bits in shift sequence";
4391 goto utf7Error;
4392 }
4393 }
4394 }
4395 if (ch != '-') {
4396 /* '-' is absorbed; other terminating
4397 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004398 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004399 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 }
4402 }
4403 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405 s++; /* consume '+' */
4406 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004407 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004408 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004409 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 }
4411 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004414 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004415 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
4417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004420 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004421 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 else {
4424 startinpos = s-starts;
4425 s++;
4426 errmsg = "unexpected special character";
4427 goto utf7Error;
4428 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004432 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 errors, &errorHandler,
4434 "utf7", errmsg,
4435 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 }
4439
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 /* end of string */
4441
4442 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4443 /* if we're in an inconsistent state, that's an error */
4444 if (surrogate ||
4445 (base64bits >= 6) ||
4446 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 errors, &errorHandler,
4450 "utf7", "unterminated shift sequence",
4451 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004452 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 goto onError;
4454 if (s < e)
4455 goto restart;
4456 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458
4459 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004462 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004463 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004464 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004465 writer.kind, writer.data, shiftOutStart);
4466 Py_XDECREF(errorHandler);
4467 Py_XDECREF(exc);
4468 _PyUnicodeWriter_Dealloc(&writer);
4469 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004470 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004471 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004472 }
4473 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004474 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004475 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004476 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 Py_XDECREF(errorHandler);
4479 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004480 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 Py_XDECREF(errorHandler);
4484 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004485 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486 return NULL;
4487}
4488
4489
Alexander Belopolsky40018472011-02-26 01:02:56 +00004490PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004491_PyUnicode_EncodeUTF7(PyObject *str,
4492 int base64SetO,
4493 int base64WhiteSpace,
4494 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004495{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004496 int kind;
4497 void *data;
4498 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004499 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004501 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 unsigned int base64bits = 0;
4503 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 char * out;
4505 char * start;
4506
Benjamin Petersonbac79492012-01-14 13:34:47 -05004507 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004508 return NULL;
4509 kind = PyUnicode_KIND(str);
4510 data = PyUnicode_DATA(str);
4511 len = PyUnicode_GET_LENGTH(str);
4512
4513 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004516 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004517 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004518 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004519 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520 if (v == NULL)
4521 return NULL;
4522
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004523 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004524 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004525 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527 if (inShift) {
4528 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4529 /* shifting out */
4530 if (base64bits) { /* output remaining bits */
4531 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4532 base64buffer = 0;
4533 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534 }
4535 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 /* Characters not in the BASE64 set implicitly unshift the sequence
4537 so no '-' is required, except if the character is itself a '-' */
4538 if (IS_BASE64(ch) || ch == '-') {
4539 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004541 *out++ = (char) ch;
4542 }
4543 else {
4544 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004545 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 else { /* not in a shift sequence */
4548 if (ch == '+') {
4549 *out++ = '+';
4550 *out++ = '-';
4551 }
4552 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4553 *out++ = (char) ch;
4554 }
4555 else {
4556 *out++ = '+';
4557 inShift = 1;
4558 goto encode_char;
4559 }
4560 }
4561 continue;
4562encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004564 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004565
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 /* code first surrogate */
4567 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004568 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 while (base64bits >= 6) {
4570 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4571 base64bits -= 6;
4572 }
4573 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004574 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 base64bits += 16;
4577 base64buffer = (base64buffer << 16) | ch;
4578 while (base64bits >= 6) {
4579 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4580 base64bits -= 6;
4581 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004582 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 if (base64bits)
4584 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4585 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004586 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004587 if (_PyBytes_Resize(&v, out - start) < 0)
4588 return NULL;
4589 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004591PyObject *
4592PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4593 Py_ssize_t size,
4594 int base64SetO,
4595 int base64WhiteSpace,
4596 const char *errors)
4597{
4598 PyObject *result;
4599 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4600 if (tmp == NULL)
4601 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004602 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004603 base64WhiteSpace, errors);
4604 Py_DECREF(tmp);
4605 return result;
4606}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004607
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613
/* --- UTF-8 Codec -------------------------------------------------------- */
4615
Alexander Belopolsky40018472011-02-26 01:02:56 +00004616PyObject *
4617PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004618 Py_ssize_t size,
4619 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620{
Walter Dörwald69652032004-09-07 20:24:22 +00004621 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4622}
4623
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004624#include "stringlib/asciilib.h"
4625#include "stringlib/codecs.h"
4626#include "stringlib/undef.h"
4627
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004628#include "stringlib/ucs1lib.h"
4629#include "stringlib/codecs.h"
4630#include "stringlib/undef.h"
4631
4632#include "stringlib/ucs2lib.h"
4633#include "stringlib/codecs.h"
4634#include "stringlib/undef.h"
4635
4636#include "stringlib/ucs4lib.h"
4637#include "stringlib/codecs.h"
4638#include "stringlib/undef.h"
4639
Antoine Pitrouab868312009-01-10 15:40:25 +00004640/* Mask to quickly check whether a C 'long' contains a
4641 non-ASCII, UTF8-encoded char. */
4642#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004643# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004644#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004645# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004646#else
4647# error C 'long' size should be either 4 or 8!
4648#endif
4649
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004650static Py_ssize_t
4651ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004652{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004653 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004654 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004655
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004656 /*
4657 * Issue #17237: m68k is a bit different from most architectures in
4658 * that objects do not use "natural alignment" - for example, int and
4659 * long are only aligned at 2-byte boundaries. Therefore the assert()
4660 * won't work; also, tests have shown that skipping the "optimised
4661 * version" will even speed up m68k.
4662 */
4663#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004665 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4666 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 /* Fast path, see in STRINGLIB(utf8_decode) for
4668 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004669 /* Help allocation */
4670 const char *_p = p;
4671 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004672 while (_p < aligned_end) {
4673 unsigned long value = *(const unsigned long *) _p;
4674 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004676 *((unsigned long *)q) = value;
4677 _p += SIZEOF_LONG;
4678 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004679 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680 p = _p;
4681 while (p < end) {
4682 if ((unsigned char)*p & 0x80)
4683 break;
4684 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004689#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004690 while (p < end) {
4691 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4692 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004693 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004694 /* Help allocation */
4695 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004696 while (_p < aligned_end) {
4697 unsigned long value = *(unsigned long *) _p;
4698 if (value & ASCII_CHAR_MASK)
4699 break;
4700 _p += SIZEOF_LONG;
4701 }
4702 p = _p;
4703 if (_p == end)
4704 break;
4705 }
4706 if ((unsigned char)*p & 0x80)
4707 break;
4708 ++p;
4709 }
4710 memcpy(dest, start, p - start);
4711 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712}
Antoine Pitrouab868312009-01-10 15:40:25 +00004713
Victor Stinner785938e2011-12-11 20:09:03 +01004714PyObject *
4715PyUnicode_DecodeUTF8Stateful(const char *s,
4716 Py_ssize_t size,
4717 const char *errors,
4718 Py_ssize_t *consumed)
4719{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004720 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004721 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723
4724 Py_ssize_t startinpos;
4725 Py_ssize_t endinpos;
4726 const char *errmsg = "";
4727 PyObject *errorHandler = NULL;
4728 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004729
4730 if (size == 0) {
4731 if (consumed)
4732 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004733 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004734 }
4735
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004736 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4737 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004738 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 *consumed = 1;
4740 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004741 }
4742
Victor Stinner8f674cc2013-04-17 23:02:17 +02004743 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004744 writer.min_length = size;
4745 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004747
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004748 writer.pos = ascii_decode(s, end, writer.data);
4749 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004750 while (s < end) {
4751 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004752 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004754 if (PyUnicode_IS_ASCII(writer.buffer))
4755 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004757 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004759 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 } else {
4761 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004762 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004763 }
4764
4765 switch (ch) {
4766 case 0:
4767 if (s == end || consumed)
4768 goto End;
4769 errmsg = "unexpected end of data";
4770 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004771 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772 break;
4773 case 1:
4774 errmsg = "invalid start byte";
4775 startinpos = s - starts;
4776 endinpos = startinpos + 1;
4777 break;
4778 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004779 case 3:
4780 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004781 errmsg = "invalid continuation byte";
4782 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004783 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 break;
4785 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004786 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787 goto onError;
4788 continue;
4789 }
4790
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004791 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004792 errors, &errorHandler,
4793 "utf-8", errmsg,
4794 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004795 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004796 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004797 }
4798
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004799End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004800 if (consumed)
4801 *consumed = s - starts;
4802
4803 Py_XDECREF(errorHandler);
4804 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004805 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004806
4807onError:
4808 Py_XDECREF(errorHandler);
4809 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004810 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004812}
4813
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a NUL-terminated wide character
   string.  Every byte that cannot be decoded is stored as the code unit
   0xDC00 + byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL if the required buffer
   size would overflow or on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
        /* The decoder writes what it can straight into `unicode`,
           advancing `s` and `outpos`, and returns when it stops. */
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* A character handed back here is about to be split into a
               surrogate pair, so it must lie outside the BMP; it is not
               itself a surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* 0 with all input used up means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004870/* Primary internal function which creates utf8 encoded bytes objects.
4871
4872 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004873 and allocate exactly as much space needed at the end. Else allocate the
4874 maximum possible needed (4 result bytes per Unicode character), and return
4875 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004876*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004877PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004878_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879{
Victor Stinner6099a032011-12-18 14:22:26 +01004880 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004881 void *data;
4882 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004884 if (!PyUnicode_Check(unicode)) {
4885 PyErr_BadArgument();
4886 return NULL;
4887 }
4888
4889 if (PyUnicode_READY(unicode) == -1)
4890 return NULL;
4891
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004892 if (PyUnicode_UTF8(unicode))
4893 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4894 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004895
4896 kind = PyUnicode_KIND(unicode);
4897 data = PyUnicode_DATA(unicode);
4898 size = PyUnicode_GET_LENGTH(unicode);
4899
Benjamin Petersonead6b532011-12-20 17:23:42 -06004900 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004901 default:
4902 assert(0);
4903 case PyUnicode_1BYTE_KIND:
4904 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4905 assert(!PyUnicode_IS_ASCII(unicode));
4906 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4907 case PyUnicode_2BYTE_KIND:
4908 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4909 case PyUnicode_4BYTE_KIND:
4910 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912}
4913
Alexander Belopolsky40018472011-02-26 01:02:56 +00004914PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004915PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4916 Py_ssize_t size,
4917 const char *errors)
4918{
4919 PyObject *v, *unicode;
4920
4921 unicode = PyUnicode_FromUnicode(s, size);
4922 if (unicode == NULL)
4923 return NULL;
4924 v = _PyUnicode_AsUTF8String(unicode, errors);
4925 Py_DECREF(unicode);
4926 return v;
4927}
4928
4929PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004930PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004932 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933}
4934
/* --- UTF-32 Codec ------------------------------------------------------- */
4936
4937PyObject *
4938PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004939 Py_ssize_t size,
4940 const char *errors,
4941 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942{
4943 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4944}
4945
4946PyObject *
4947PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 Py_ssize_t size,
4949 const char *errors,
4950 int *byteorder,
4951 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952{
4953 const char *starts = s;
4954 Py_ssize_t startinpos;
4955 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004956 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004957 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004958 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004959 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004960 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004961 PyObject *errorHandler = NULL;
4962 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004963
Walter Dörwald41980ca2007-08-16 21:55:45 +00004964 q = (unsigned char *)s;
4965 e = q + size;
4966
4967 if (byteorder)
4968 bo = *byteorder;
4969
4970 /* Check for BOM marks (U+FEFF) in the input and adjust current
4971 byte order setting accordingly. In native mode, the leading BOM
4972 mark is skipped, in all other modes, it is copied to the output
4973 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004974 if (bo == 0 && size >= 4) {
4975 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4976 if (bom == 0x0000FEFF) {
4977 bo = -1;
4978 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004980 else if (bom == 0xFFFE0000) {
4981 bo = 1;
4982 q += 4;
4983 }
4984 if (byteorder)
4985 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986 }
4987
Victor Stinnere64322e2012-10-30 23:12:47 +01004988 if (q == e) {
4989 if (consumed)
4990 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004991 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004992 }
4993
Victor Stinnere64322e2012-10-30 23:12:47 +01004994#ifdef WORDS_BIGENDIAN
4995 le = bo < 0;
4996#else
4997 le = bo <= 0;
4998#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004999 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005000
Victor Stinner8f674cc2013-04-17 23:02:17 +02005001 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005002 writer.min_length = (e - q + 3) / 4;
5003 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005004 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005005
Victor Stinnere64322e2012-10-30 23:12:47 +01005006 while (1) {
5007 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005008 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005009
Victor Stinnere64322e2012-10-30 23:12:47 +01005010 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 enum PyUnicode_Kind kind = writer.kind;
5012 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005014 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005015 if (le) {
5016 do {
5017 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5018 if (ch > maxch)
5019 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005020 if (kind != PyUnicode_1BYTE_KIND &&
5021 Py_UNICODE_IS_SURROGATE(ch))
5022 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005023 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005024 q += 4;
5025 } while (q <= last);
5026 }
5027 else {
5028 do {
5029 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5030 if (ch > maxch)
5031 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005032 if (kind != PyUnicode_1BYTE_KIND &&
5033 Py_UNICODE_IS_SURROGATE(ch))
5034 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005035 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005036 q += 4;
5037 } while (q <= last);
5038 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005039 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005040 }
5041
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005042 if (Py_UNICODE_IS_SURROGATE(ch)) {
5043 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5044 startinpos = ((const char *)q) - starts;
5045 endinpos = startinpos + 4;
5046 }
5047 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005048 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005050 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005052 startinpos = ((const char *)q) - starts;
5053 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005055 else {
5056 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005057 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005058 goto onError;
5059 q += 4;
5060 continue;
5061 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005063 startinpos = ((const char *)q) - starts;
5064 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005066
5067 /* The remaining input chars are ignored if the callback
5068 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005069 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005071 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005073 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 }
5076
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 Py_XDECREF(errorHandler);
5081 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005082 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005085 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086 Py_XDECREF(errorHandler);
5087 Py_XDECREF(exc);
5088 return NULL;
5089}
5090
5091PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005092_PyUnicode_EncodeUTF32(PyObject *str,
5093 const char *errors,
5094 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005096 int kind;
5097 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005098 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005099 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005100 unsigned char *p;
5101 Py_ssize_t nsize, i;
5102 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005103#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005104 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005106 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005108 const char *encoding;
5109 PyObject *errorHandler = NULL;
5110 PyObject *exc = NULL;
5111 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112
Serhiy Storchaka30793282014-01-04 22:44:01 +02005113#define STORECHAR(CH) \
5114 do { \
5115 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5116 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5117 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5118 p[iorder[0]] = (CH) & 0xff; \
5119 p += 4; \
5120 } while(0)
5121
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005122 if (!PyUnicode_Check(str)) {
5123 PyErr_BadArgument();
5124 return NULL;
5125 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005126 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005127 return NULL;
5128 kind = PyUnicode_KIND(str);
5129 data = PyUnicode_DATA(str);
5130 len = PyUnicode_GET_LENGTH(str);
5131
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005132 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005133 if (nsize > PY_SSIZE_T_MAX / 4)
5134 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005135 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005136 if (v == NULL)
5137 return NULL;
5138
Serhiy Storchaka30793282014-01-04 22:44:01 +02005139 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005140 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005141 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005142 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005143 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005144
Serhiy Storchaka30793282014-01-04 22:44:01 +02005145 if (byteorder == -1) {
5146 /* force LE */
5147 iorder[0] = 0;
5148 iorder[1] = 1;
5149 iorder[2] = 2;
5150 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005151 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005152 }
5153 else if (byteorder == 1) {
5154 /* force BE */
5155 iorder[0] = 3;
5156 iorder[1] = 2;
5157 iorder[2] = 1;
5158 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005159 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005160 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005161 else
5162 encoding = "utf-32";
5163
5164 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005165 for (i = 0; i < len; i++)
5166 STORECHAR(PyUnicode_READ(kind, data, i));
5167 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005168 }
5169
Serhiy Storchaka30793282014-01-04 22:44:01 +02005170 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005171 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005172 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5173 i++;
5174 assert(ch <= MAX_UNICODE);
5175 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5176 STORECHAR(ch);
5177 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005178 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005179
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005180 rep = unicode_encode_call_errorhandler(
5181 errors, &errorHandler,
5182 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005183 str, &exc, i-1, i, &i);
5184
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005185 if (!rep)
5186 goto error;
5187
5188 if (PyBytes_Check(rep)) {
5189 repsize = PyBytes_GET_SIZE(rep);
5190 if (repsize & 3) {
5191 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005192 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005193 "surrogates not allowed");
5194 goto error;
5195 }
5196 moreunits = repsize / 4;
5197 }
5198 else {
5199 assert(PyUnicode_Check(rep));
5200 if (PyUnicode_READY(rep) < 0)
5201 goto error;
5202 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5203 if (!PyUnicode_IS_ASCII(rep)) {
5204 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005205 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005206 "surrogates not allowed");
5207 goto error;
5208 }
5209 }
5210
5211 /* four bytes are reserved for each surrogate */
5212 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005213 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005214 Py_ssize_t morebytes = 4 * (moreunits - 1);
5215 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5216 /* integer overflow */
5217 PyErr_NoMemory();
5218 goto error;
5219 }
5220 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5221 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005222 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005223 }
5224
5225 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005226 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5227 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005228 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005229 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005230 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005231 repdata = PyUnicode_1BYTE_DATA(rep);
5232 while (repsize--) {
5233 Py_UCS4 ch = *repdata++;
5234 STORECHAR(ch);
5235 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005236 }
5237
5238 Py_CLEAR(rep);
5239 }
5240
5241 /* Cut back to size actually needed. This is necessary for, for example,
5242 encoding of a string containing isolated surrogates and the 'ignore'
5243 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005244 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005245 if (nsize != PyBytes_GET_SIZE(v))
5246 _PyBytes_Resize(&v, nsize);
5247 Py_XDECREF(errorHandler);
5248 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005249 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005250 error:
5251 Py_XDECREF(rep);
5252 Py_XDECREF(errorHandler);
5253 Py_XDECREF(exc);
5254 Py_XDECREF(v);
5255 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005256#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005257}
5258
Alexander Belopolsky40018472011-02-26 01:02:56 +00005259PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005260PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5261 Py_ssize_t size,
5262 const char *errors,
5263 int byteorder)
5264{
5265 PyObject *result;
5266 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5267 if (tmp == NULL)
5268 return NULL;
5269 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5270 Py_DECREF(tmp);
5271 return result;
5272}
5273
5274PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005275PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276{
Victor Stinnerb960b342011-11-20 19:12:52 +01005277 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005278}
5279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280/* --- UTF-16 Codec ------------------------------------------------------- */
5281
Tim Peters772747b2001-08-09 22:21:55 +00005282PyObject *
5283PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 Py_ssize_t size,
5285 const char *errors,
5286 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287{
Walter Dörwald69652032004-09-07 20:24:22 +00005288 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5289}
5290
5291PyObject *
5292PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 Py_ssize_t size,
5294 const char *errors,
5295 int *byteorder,
5296 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005297{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005299 Py_ssize_t startinpos;
5300 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005301 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005302 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005303 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005304 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005305 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005306 PyObject *errorHandler = NULL;
5307 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005308 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309
Tim Peters772747b2001-08-09 22:21:55 +00005310 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005311 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312
5313 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005314 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005316 /* Check for BOM marks (U+FEFF) in the input and adjust current
5317 byte order setting accordingly. In native mode, the leading BOM
5318 mark is skipped, in all other modes, it is copied to the output
5319 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005320 if (bo == 0 && size >= 2) {
5321 const Py_UCS4 bom = (q[1] << 8) | q[0];
5322 if (bom == 0xFEFF) {
5323 q += 2;
5324 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005326 else if (bom == 0xFFFE) {
5327 q += 2;
5328 bo = 1;
5329 }
5330 if (byteorder)
5331 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005332 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333
Antoine Pitrou63065d72012-05-15 23:48:04 +02005334 if (q == e) {
5335 if (consumed)
5336 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005337 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005338 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005339
Christian Heimes743e0cd2012-10-17 23:52:17 +02005340#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005341 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005342 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005343#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005344 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005345 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005346#endif
Tim Peters772747b2001-08-09 22:21:55 +00005347
Antoine Pitrou63065d72012-05-15 23:48:04 +02005348 /* Note: size will always be longer than the resulting Unicode
5349 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005350 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005351 writer.min_length = (e - q + 1) / 2;
5352 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005353 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005354
Antoine Pitrou63065d72012-05-15 23:48:04 +02005355 while (1) {
5356 Py_UCS4 ch = 0;
5357 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005359 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005361 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005362 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005363 native_ordering);
5364 else
5365 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005366 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005367 native_ordering);
5368 } else if (kind == PyUnicode_2BYTE_KIND) {
5369 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005370 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005371 native_ordering);
5372 } else {
5373 assert(kind == PyUnicode_4BYTE_KIND);
5374 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005375 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005376 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005377 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005379
Antoine Pitrou63065d72012-05-15 23:48:04 +02005380 switch (ch)
5381 {
5382 case 0:
5383 /* remaining byte at the end? (size should be even) */
5384 if (q == e || consumed)
5385 goto End;
5386 errmsg = "truncated data";
5387 startinpos = ((const char *)q) - starts;
5388 endinpos = ((const char *)e) - starts;
5389 break;
5390 /* The remaining input chars are ignored if the callback
5391 chooses to skip the input */
5392 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005393 q -= 2;
5394 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005395 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005396 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005397 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005398 endinpos = ((const char *)e) - starts;
5399 break;
5400 case 2:
5401 errmsg = "illegal encoding";
5402 startinpos = ((const char *)q) - 2 - starts;
5403 endinpos = startinpos + 2;
5404 break;
5405 case 3:
5406 errmsg = "illegal UTF-16 surrogate";
5407 startinpos = ((const char *)q) - 4 - starts;
5408 endinpos = startinpos + 2;
5409 break;
5410 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005411 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005412 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 continue;
5414 }
5415
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005416 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005417 errors,
5418 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005419 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005420 &starts,
5421 (const char **)&e,
5422 &startinpos,
5423 &endinpos,
5424 &exc,
5425 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005426 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 }
5429
Antoine Pitrou63065d72012-05-15 23:48:04 +02005430End:
Walter Dörwald69652032004-09-07 20:24:22 +00005431 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005433
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 Py_XDECREF(errorHandler);
5435 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005436 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005439 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440 Py_XDECREF(errorHandler);
5441 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 return NULL;
5443}
5444
Tim Peters772747b2001-08-09 22:21:55 +00005445PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005446_PyUnicode_EncodeUTF16(PyObject *str,
5447 const char *errors,
5448 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005450 enum PyUnicode_Kind kind;
5451 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005452 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005453 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005454 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005455 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005456#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005457 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005458#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005459 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005460#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005461 const char *encoding;
5462 Py_ssize_t nsize, pos;
5463 PyObject *errorHandler = NULL;
5464 PyObject *exc = NULL;
5465 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005466
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005467 if (!PyUnicode_Check(str)) {
5468 PyErr_BadArgument();
5469 return NULL;
5470 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005471 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005472 return NULL;
5473 kind = PyUnicode_KIND(str);
5474 data = PyUnicode_DATA(str);
5475 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005476
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005477 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005478 if (kind == PyUnicode_4BYTE_KIND) {
5479 const Py_UCS4 *in = (const Py_UCS4 *)data;
5480 const Py_UCS4 *end = in + len;
5481 while (in < end)
5482 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005483 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005484 }
5485 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005487 nsize = len + pairs + (byteorder == 0);
5488 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 if (v == NULL)
5490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005492 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005493 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005494 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005496 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005497 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005498 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005499
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005500 if (kind == PyUnicode_1BYTE_KIND) {
5501 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5502 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005503 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005504
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005505 if (byteorder < 0)
5506 encoding = "utf-16-le";
5507 else if (byteorder > 0)
5508 encoding = "utf-16-be";
5509 else
5510 encoding = "utf-16";
5511
5512 pos = 0;
5513 while (pos < len) {
5514 Py_ssize_t repsize, moreunits;
5515
5516 if (kind == PyUnicode_2BYTE_KIND) {
5517 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5518 &out, native_ordering);
5519 }
5520 else {
5521 assert(kind == PyUnicode_4BYTE_KIND);
5522 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5523 &out, native_ordering);
5524 }
5525 if (pos == len)
5526 break;
5527
5528 rep = unicode_encode_call_errorhandler(
5529 errors, &errorHandler,
5530 encoding, "surrogates not allowed",
5531 str, &exc, pos, pos + 1, &pos);
5532 if (!rep)
5533 goto error;
5534
5535 if (PyBytes_Check(rep)) {
5536 repsize = PyBytes_GET_SIZE(rep);
5537 if (repsize & 1) {
5538 raise_encode_exception(&exc, encoding,
5539 str, pos - 1, pos,
5540 "surrogates not allowed");
5541 goto error;
5542 }
5543 moreunits = repsize / 2;
5544 }
5545 else {
5546 assert(PyUnicode_Check(rep));
5547 if (PyUnicode_READY(rep) < 0)
5548 goto error;
5549 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5550 if (!PyUnicode_IS_ASCII(rep)) {
5551 raise_encode_exception(&exc, encoding,
5552 str, pos - 1, pos,
5553 "surrogates not allowed");
5554 goto error;
5555 }
5556 }
5557
5558 /* two bytes are reserved for each surrogate */
5559 if (moreunits > 1) {
5560 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5561 Py_ssize_t morebytes = 2 * (moreunits - 1);
5562 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5563 /* integer overflow */
5564 PyErr_NoMemory();
5565 goto error;
5566 }
5567 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5568 goto error;
5569 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5570 }
5571
5572 if (PyBytes_Check(rep)) {
5573 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5574 out += moreunits;
5575 } else /* rep is unicode */ {
5576 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5577 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5578 &out, native_ordering);
5579 }
5580
5581 Py_CLEAR(rep);
5582 }
5583
5584 /* Cut back to size actually needed. This is necessary for, for example,
5585 encoding of a string containing isolated surrogates and the 'ignore' handler
5586 is used. */
5587 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5588 if (nsize != PyBytes_GET_SIZE(v))
5589 _PyBytes_Resize(&v, nsize);
5590 Py_XDECREF(errorHandler);
5591 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005592 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005593 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005594 error:
5595 Py_XDECREF(rep);
5596 Py_XDECREF(errorHandler);
5597 Py_XDECREF(exc);
5598 Py_XDECREF(v);
5599 return NULL;
5600#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601}
5602
Alexander Belopolsky40018472011-02-26 01:02:56 +00005603PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005604PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5605 Py_ssize_t size,
5606 const char *errors,
5607 int byteorder)
5608{
5609 PyObject *result;
5610 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5611 if (tmp == NULL)
5612 return NULL;
5613 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5614 Py_DECREF(tmp);
5615 return result;
5616}
5617
5618PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005619PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005621 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622}
5623
5624/* --- Unicode Escape Codec ----------------------------------------------- */
5625
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string `s` of `size` bytes still decodes to a pure ASCII
   string, and if so how long that string is.

   Returns the number of characters needed to hold the decoded string, or
   -1 if that cannot be determined by this quick scan:
     - `size` is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `end`: computing a pointer
       in front of the buffer is not valid pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
            }
        }
    }
    return length;
}
5680
Fredrik Lundh06d12682001-01-24 07:59:11 +00005681static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005682
Alexander Belopolsky40018472011-02-26 01:02:56 +00005683PyObject *
5684PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005685 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005689 Py_ssize_t startinpos;
5690 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005693 char* message;
5694 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 PyObject *errorHandler = NULL;
5696 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005697 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005698
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005699 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005700 if (len == 0)
5701 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005702
5703 /* After length_of_escaped_ascii_string() there are two alternatives,
5704 either the string is pure ASCII with named escapes like \n, etc.
5705 and we determined it's exact size (common case)
5706 or it contains \x, \u, ... escape sequences. then we create a
5707 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005708 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005709 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005710 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711 }
5712 else {
5713 /* Escaped strings will always be longer than the resulting
5714 Unicode string, so we start with size here and then reduce the
5715 length after conversion to the true value.
5716 (but if the error callback returns a long replacement string
5717 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005718 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005719 }
5720
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005722 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005724
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 while (s < end) {
5726 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005727 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
5730 /* Non-escape characters are interpreted as Unicode ordinals */
5731 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005732 x = (unsigned char)*s;
5733 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005734 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005735 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 continue;
5737 }
5738
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 /* \ - Escapes */
5741 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005742 c = *s++;
5743 if (s > end)
5744 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005746 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005749#define WRITECHAR(ch) \
5750 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005751 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005752 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005753 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005754
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005756 case '\\': WRITECHAR('\\'); break;
5757 case '\'': WRITECHAR('\''); break;
5758 case '\"': WRITECHAR('\"'); break;
5759 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005760 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005761 case 'f': WRITECHAR('\014'); break;
5762 case 't': WRITECHAR('\t'); break;
5763 case 'n': WRITECHAR('\n'); break;
5764 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005765 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005767 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005768 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 case '0': case '1': case '2': case '3':
5772 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005773 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005774 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005775 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005776 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005777 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005779 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 break;
5781
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 /* hex escapes */
5783 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005785 digits = 2;
5786 message = "truncated \\xXX escape";
5787 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005791 digits = 4;
5792 message = "truncated \\uXXXX escape";
5793 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005796 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005797 digits = 8;
5798 message = "truncated \\UXXXXXXXX escape";
5799 hexescape:
5800 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005801 if (end - s < digits) {
5802 /* count only hex digits */
5803 for (; s < end; ++s) {
5804 c = (unsigned char)*s;
5805 if (!Py_ISXDIGIT(c))
5806 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005807 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005808 goto error;
5809 }
5810 for (; digits--; ++s) {
5811 c = (unsigned char)*s;
5812 if (!Py_ISXDIGIT(c))
5813 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005814 chr = (chr<<4) & ~0xF;
5815 if (c >= '0' && c <= '9')
5816 chr += c - '0';
5817 else if (c >= 'a' && c <= 'f')
5818 chr += 10 + c - 'a';
5819 else
5820 chr += 10 + c - 'A';
5821 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005822 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 /* _decoding_error will have already written into the
5824 target buffer. */
5825 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005826 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005827 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005828 message = "illegal Unicode character";
5829 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005830 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005831 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005832 break;
5833
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005835 case 'N':
5836 message = "malformed \\N character escape";
5837 if (ucnhash_CAPI == NULL) {
5838 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005839 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5840 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005841 if (ucnhash_CAPI == NULL)
5842 goto ucnhashError;
5843 }
5844 if (*s == '{') {
5845 const char *start = s+1;
5846 /* look for the closing brace */
5847 while (*s != '}' && s < end)
5848 s++;
5849 if (s > start && s < end && *s == '}') {
5850 /* found a name. look it up in the unicode database */
5851 message = "unknown Unicode character name";
5852 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005853 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005854 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005855 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005856 goto store;
5857 }
5858 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005859 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005860
5861 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005862 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 message = "\\ at end of string";
5864 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005865 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005866 }
5867 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005868 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005869 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005870 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005871 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005873 continue;
5874
5875 error:
5876 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005877 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005878 errors, &errorHandler,
5879 "unicodeescape", message,
5880 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005881 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005882 goto onError;
5883 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005885#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005886
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005887 Py_XDECREF(errorHandler);
5888 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005889 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005890
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005892 PyErr_SetString(
5893 PyExc_UnicodeError,
5894 "\\N escapes not supported (can't load unicodedata module)"
5895 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 Py_XDECREF(errorHandler);
5898 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005899 return NULL;
5900
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005902 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 Py_XDECREF(errorHandler);
5904 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 return NULL;
5906}
5907
5908/* Return a Unicode-Escape string version of the Unicode object.
5909
5910 If quotes is true, the string is enclosed in u"" or u'' quotes as
5911 appropriate.
5912
5913*/
5914
Alexander Belopolsky40018472011-02-26 01:02:56 +00005915PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005918 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005919 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005921 int kind;
5922 void *data;
5923 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924
Ezio Melottie7f90372012-10-05 03:33:31 +03005925 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005926 escape.
5927
Ezio Melottie7f90372012-10-05 03:33:31 +03005928 For UCS1 strings it's '\xxx', 4 bytes per source character.
5929 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5930 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005931 */
5932
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005933 if (!PyUnicode_Check(unicode)) {
5934 PyErr_BadArgument();
5935 return NULL;
5936 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005937 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005938 return NULL;
5939 len = PyUnicode_GET_LENGTH(unicode);
5940 kind = PyUnicode_KIND(unicode);
5941 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005942 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005943 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5944 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5945 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5946 }
5947
5948 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005949 return PyBytes_FromStringAndSize(NULL, 0);
5950
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005951 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005953
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005954 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 if (repr == NULL)
5959 return NULL;
5960
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005961 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005963 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005964 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005965
Walter Dörwald79e913e2007-05-12 11:08:06 +00005966 /* Escape backslashes */
5967 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 *p++ = '\\';
5969 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005970 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005972
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005973 /* Map 21-bit characters to '\U00xxxxxx' */
5974 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005975 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005976 *p++ = '\\';
5977 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005978 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5982 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5983 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5984 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5985 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005987 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005988
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005990 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 *p++ = '\\';
5992 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005993 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5994 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5995 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5996 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005998
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005999 /* Map special whitespace to '\t', \n', '\r' */
6000 else if (ch == '\t') {
6001 *p++ = '\\';
6002 *p++ = 't';
6003 }
6004 else if (ch == '\n') {
6005 *p++ = '\\';
6006 *p++ = 'n';
6007 }
6008 else if (ch == '\r') {
6009 *p++ = '\\';
6010 *p++ = 'r';
6011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006012
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006013 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006014 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006016 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006017 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6018 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006019 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006020
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 /* Copy everything else as-is */
6022 else
6023 *p++ = (char) ch;
6024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006026 assert(p - PyBytes_AS_STRING(repr) > 0);
6027 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6028 return NULL;
6029 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030}
6031
Alexander Belopolsky40018472011-02-26 01:02:56 +00006032PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6034 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006036 PyObject *result;
6037 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6038 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040 result = PyUnicode_AsUnicodeEscapeString(tmp);
6041 Py_DECREF(tmp);
6042 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043}
6044
6045/* --- Raw Unicode Escape Codec ------------------------------------------- */
6046
Alexander Belopolsky40018472011-02-26 01:02:56 +00006047PyObject *
6048PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006049 Py_ssize_t size,
6050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006053 Py_ssize_t startinpos;
6054 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006055 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 const char *end;
6057 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 PyObject *errorHandler = NULL;
6059 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006060
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006061 if (size == 0)
6062 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006063
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 /* Escaped strings will always be longer than the resulting
6065 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 length after conversion to the true value. (But decoding error
6067 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006068 _PyUnicodeWriter_Init(&writer);
6069 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006070
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 end = s + size;
6072 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 unsigned char c;
6074 Py_UCS4 x;
6075 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006076 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 /* Non-escape characters are interpreted as Unicode ordinals */
6079 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006080 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006081 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006082 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006084 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 startinpos = s-starts;
6086
6087 /* \u-escapes are only interpreted iff the number of leading
6088 backslashes if odd */
6089 bs = s;
6090 for (;s < end;) {
6091 if (*s != '\\')
6092 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006093 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006094 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006095 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 }
6097 if (((s - bs) & 1) == 0 ||
6098 s >= end ||
6099 (*s != 'u' && *s != 'U')) {
6100 continue;
6101 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006102 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 count = *s=='u' ? 4 : 8;
6104 s++;
6105
6106 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 for (x = 0, i = 0; i < count; ++i, ++s) {
6108 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006109 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006111 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 errors, &errorHandler,
6113 "rawunicodeescape", "truncated \\uXXXX",
6114 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006115 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 goto onError;
6117 goto nextByte;
6118 }
6119 x = (x<<4) & ~0xF;
6120 if (c >= '0' && c <= '9')
6121 x += c - '0';
6122 else if (c >= 'a' && c <= 'f')
6123 x += 10 + c - 'a';
6124 else
6125 x += 10 + c - 'A';
6126 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006127 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006128 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006129 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006130 }
6131 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006132 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006133 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006134 errors, &errorHandler,
6135 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006137 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006139 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 nextByte:
6141 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 Py_XDECREF(errorHandler);
6144 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006145 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006146
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006148 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149 Py_XDECREF(errorHandler);
6150 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 return NULL;
6152}
6153
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154
Alexander Belopolsky40018472011-02-26 01:02:56 +00006155PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006158 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 char *p;
6160 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006161 Py_ssize_t expandsize, pos;
6162 int kind;
6163 void *data;
6164 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 if (!PyUnicode_Check(unicode)) {
6167 PyErr_BadArgument();
6168 return NULL;
6169 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006170 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 return NULL;
6172 kind = PyUnicode_KIND(unicode);
6173 data = PyUnicode_DATA(unicode);
6174 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006175 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6176 bytes, and 1 byte characters 4. */
6177 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006178
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006181
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 if (repr == NULL)
6184 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006188 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189 for (pos = 0; pos < len; pos++) {
6190 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* Map 32-bit characters to '\Uxxxxxxxx' */
6192 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006193 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006194 *p++ = '\\';
6195 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006196 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6199 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6201 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6202 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6203 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006204 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 *p++ = '\\';
6208 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006209 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6210 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6211 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6212 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* Copy everything else as-is */
6215 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 *p++ = (char) ch;
6217 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006218
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006219 assert(p > q);
6220 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006221 return NULL;
6222 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223}
6224
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6227 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006229 PyObject *result;
6230 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6231 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006232 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006233 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6234 Py_DECREF(tmp);
6235 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236}
6237
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006238/* --- Unicode Internal Codec ------------------------------------------- */
6239
Alexander Belopolsky40018472011-02-26 01:02:56 +00006240PyObject *
6241_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006242 Py_ssize_t size,
6243 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006244{
6245 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006246 Py_ssize_t startinpos;
6247 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006248 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006249 const char *end;
6250 const char *reason;
6251 PyObject *errorHandler = NULL;
6252 PyObject *exc = NULL;
6253
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006255 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006256 1))
6257 return NULL;
6258
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006259 if (size == 0)
6260 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006261
Victor Stinner8f674cc2013-04-17 23:02:17 +02006262 _PyUnicodeWriter_Init(&writer);
6263 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6264 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006266 }
6267 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268
Victor Stinner8f674cc2013-04-17 23:02:17 +02006269 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006270 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006271 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006272 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006273 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006274 endinpos = end-starts;
6275 reason = "truncated input";
6276 goto error;
6277 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006278 /* We copy the raw representation one byte at a time because the
6279 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006280 ((char *) &uch)[0] = s[0];
6281 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006282#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006283 ((char *) &uch)[2] = s[2];
6284 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006285#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006286 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006287#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006288 /* We have to sanity check the raw data, otherwise doom looms for
6289 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006290 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006291 endinpos = s - starts + Py_UNICODE_SIZE;
6292 reason = "illegal code point (> 0x10FFFF)";
6293 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006295#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006296 s += Py_UNICODE_SIZE;
6297#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006298 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006299 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006300 Py_UNICODE uch2;
6301 ((char *) &uch2)[0] = s[0];
6302 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006303 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 {
Victor Stinner551ac952011-11-29 22:58:13 +01006305 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006306 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006307 }
6308 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006309#endif
6310
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006311 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006312 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006313 continue;
6314
6315 error:
6316 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006317 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006318 errors, &errorHandler,
6319 "unicode_internal", reason,
6320 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006321 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006322 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 }
6324
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325 Py_XDECREF(errorHandler);
6326 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006327 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006330 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006331 Py_XDECREF(errorHandler);
6332 Py_XDECREF(exc);
6333 return NULL;
6334}
6335
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336/* --- Latin-1 Codec ------------------------------------------------------ */
6337
Alexander Belopolsky40018472011-02-26 01:02:56 +00006338PyObject *
6339PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006340 Py_ssize_t size,
6341 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006344 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345}
6346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006348static void
6349make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006350 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006351 PyObject *unicode,
6352 Py_ssize_t startpos, Py_ssize_t endpos,
6353 const char *reason)
6354{
6355 if (*exceptionObject == NULL) {
6356 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006357 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006358 encoding, unicode, startpos, endpos, reason);
6359 }
6360 else {
6361 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6362 goto onError;
6363 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6364 goto onError;
6365 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6366 goto onError;
6367 return;
6368 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006369 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006370 }
6371}
6372
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006373/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006374static void
6375raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006376 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006377 PyObject *unicode,
6378 Py_ssize_t startpos, Py_ssize_t endpos,
6379 const char *reason)
6380{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006381 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006382 encoding, unicode, startpos, endpos, reason);
6383 if (*exceptionObject != NULL)
6384 PyCodec_StrictErrors(*exceptionObject);
6385}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006386
6387/* error handling callback helper:
6388 build arguments, call the callback and check the arguments,
6389 put the result into newpos and return the replacement string, which
6390 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006391static PyObject *
6392unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006393 PyObject **errorHandler,
6394 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006395 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006396 Py_ssize_t startpos, Py_ssize_t endpos,
6397 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006399 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006400 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 PyObject *restuple;
6402 PyObject *resunicode;
6403
6404 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408 }
6409
Benjamin Petersonbac79492012-01-14 13:34:47 -05006410 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006411 return NULL;
6412 len = PyUnicode_GET_LENGTH(unicode);
6413
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006414 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006415 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418
6419 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006424 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 Py_DECREF(restuple);
6426 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006428 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 &resunicode, newpos)) {
6430 Py_DECREF(restuple);
6431 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006433 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6434 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6435 Py_DECREF(restuple);
6436 return NULL;
6437 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006439 *newpos = len + *newpos;
6440 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006441 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 Py_DECREF(restuple);
6443 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445 Py_INCREF(resunicode);
6446 Py_DECREF(restuple);
6447 return resunicode;
6448}
6449
Alexander Belopolsky40018472011-02-26 01:02:56 +00006450static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006452 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006453 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006455 /* input state */
6456 Py_ssize_t pos=0, size;
6457 int kind;
6458 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 /* output object */
6460 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 /* pointer into the output */
6462 char *str;
6463 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006464 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006465 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6466 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467 PyObject *errorHandler = NULL;
6468 PyObject *exc = NULL;
6469 /* the following variable is used for caching string comparisons
6470 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6471 int known_errorHandler = -1;
6472
Benjamin Petersonbac79492012-01-14 13:34:47 -05006473 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 return NULL;
6475 size = PyUnicode_GET_LENGTH(unicode);
6476 kind = PyUnicode_KIND(unicode);
6477 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 /* allocate enough for a simple encoding without
6479 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006480 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006481 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006482 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006484 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006485 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006486 ressize = size;
6487
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006488 while (pos < size) {
6489 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 /* can we encode this? */
6492 if (c<limit) {
6493 /* no overflow check, because we know that the space is enough */
6494 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006496 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 Py_ssize_t requiredsize;
6499 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006502 Py_ssize_t collstart = pos;
6503 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 ++collend;
6507 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6508 if (known_errorHandler==-1) {
6509 if ((errors==NULL) || (!strcmp(errors, "strict")))
6510 known_errorHandler = 1;
6511 else if (!strcmp(errors, "replace"))
6512 known_errorHandler = 2;
6513 else if (!strcmp(errors, "ignore"))
6514 known_errorHandler = 3;
6515 else if (!strcmp(errors, "xmlcharrefreplace"))
6516 known_errorHandler = 4;
6517 else
6518 known_errorHandler = 0;
6519 }
6520 switch (known_errorHandler) {
6521 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006522 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 goto onError;
6524 case 2: /* replace */
6525 while (collstart++<collend)
6526 *str++ = '?'; /* fall through */
6527 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 break;
6530 case 4: /* xmlcharrefreplace */
6531 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006532 /* determine replacement size */
6533 for (i = collstart, repsize = 0; i < collend; ++i) {
6534 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6535 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006545 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006547 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006548 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006550 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 if (requiredsize > ressize) {
6554 if (requiredsize<2*ressize)
6555 requiredsize = 2*ressize;
6556 if (_PyBytes_Resize(&res, requiredsize))
6557 goto onError;
6558 str = PyBytes_AS_STRING(res) + respos;
6559 ressize = requiredsize;
6560 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006561 /* generate replacement */
6562 for (i = collstart; i < collend; ++i) {
6563 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006565 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 break;
6567 default:
6568 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569 encoding, reason, unicode, &exc,
6570 collstart, collend, &newpos);
6571 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006572 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006574 if (PyBytes_Check(repunicode)) {
6575 /* Directly copy bytes result to output. */
6576 repsize = PyBytes_Size(repunicode);
6577 if (repsize > 1) {
6578 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006579 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006580 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6581 Py_DECREF(repunicode);
6582 goto onError;
6583 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006584 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006585 ressize += repsize-1;
6586 }
6587 memcpy(str, PyBytes_AsString(repunicode), repsize);
6588 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006589 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006590 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006591 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006592 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 /* need more space? (at least enough for what we
6594 have+the replacement+the rest of the string, so
6595 we won't have to check space for encodable characters) */
6596 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006597 repsize = PyUnicode_GET_LENGTH(repunicode);
6598 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 if (requiredsize > ressize) {
6600 if (requiredsize<2*ressize)
6601 requiredsize = 2*ressize;
6602 if (_PyBytes_Resize(&res, requiredsize)) {
6603 Py_DECREF(repunicode);
6604 goto onError;
6605 }
6606 str = PyBytes_AS_STRING(res) + respos;
6607 ressize = requiredsize;
6608 }
6609 /* check if there is anything unencodable in the replacement
6610 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006611 for (i = 0; repsize-->0; ++i, ++str) {
6612 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006614 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006615 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 Py_DECREF(repunicode);
6617 goto onError;
6618 }
6619 *str = (char)c;
6620 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006621 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006622 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006623 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006624 }
6625 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006626 /* Resize if we allocated to much */
6627 size = str - PyBytes_AS_STRING(res);
6628 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006629 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006630 if (_PyBytes_Resize(&res, size) < 0)
6631 goto onError;
6632 }
6633
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006634 Py_XDECREF(errorHandler);
6635 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006636 return res;
6637
6638 onError:
6639 Py_XDECREF(res);
6640 Py_XDECREF(errorHandler);
6641 Py_XDECREF(exc);
6642 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006643}
6644
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006646PyObject *
6647PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006648 Py_ssize_t size,
6649 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006651 PyObject *result;
6652 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6653 if (unicode == NULL)
6654 return NULL;
6655 result = unicode_encode_ucs1(unicode, errors, 256);
6656 Py_DECREF(unicode);
6657 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658}
6659
Alexander Belopolsky40018472011-02-26 01:02:56 +00006660PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006661_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662{
6663 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 PyErr_BadArgument();
6665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006667 if (PyUnicode_READY(unicode) == -1)
6668 return NULL;
6669 /* Fast path: if it is a one-byte string, construct
6670 bytes object directly. */
6671 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6672 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6673 PyUnicode_GET_LENGTH(unicode));
6674 /* Non-Latin-1 characters present. Defer to above function to
6675 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006676 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006677}
6678
6679PyObject*
6680PyUnicode_AsLatin1String(PyObject *unicode)
6681{
6682 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683}
6684
6685/* --- 7-bit ASCII Codec -------------------------------------------------- */
6686
Alexander Belopolsky40018472011-02-26 01:02:56 +00006687PyObject *
6688PyUnicode_DecodeASCII(const char *s,
6689 Py_ssize_t size,
6690 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006693 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006694 int kind;
6695 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006696 Py_ssize_t startinpos;
6697 Py_ssize_t endinpos;
6698 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 const char *e;
6700 PyObject *errorHandler = NULL;
6701 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006702
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006704 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006705
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006707 if (size == 1 && (unsigned char)s[0] < 128)
6708 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006709
Victor Stinner8f674cc2013-04-17 23:02:17 +02006710 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006711 writer.min_length = size;
6712 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006713 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006717 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006718 writer.pos = outpos;
6719 if (writer.pos == size)
6720 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006721
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006722 s += writer.pos;
6723 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006725 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006727 PyUnicode_WRITE(kind, data, writer.pos, c);
6728 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 ++s;
6730 }
6731 else {
6732 startinpos = s-starts;
6733 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006734 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 errors, &errorHandler,
6736 "ascii", "ordinal not in range(128)",
6737 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006740 kind = writer.kind;
6741 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006744 Py_XDECREF(errorHandler);
6745 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006746 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006747
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006749 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750 Py_XDECREF(errorHandler);
6751 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 return NULL;
6753}
6754
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006755/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006756PyObject *
6757PyUnicode_EncodeASCII(const Py_UNICODE *p,
6758 Py_ssize_t size,
6759 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 PyObject *result;
6762 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6763 if (unicode == NULL)
6764 return NULL;
6765 result = unicode_encode_ucs1(unicode, errors, 128);
6766 Py_DECREF(unicode);
6767 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768}
6769
Alexander Belopolsky40018472011-02-26 01:02:56 +00006770PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006771_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
6773 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 PyErr_BadArgument();
6775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006777 if (PyUnicode_READY(unicode) == -1)
6778 return NULL;
6779 /* Fast path: if it is an ASCII-only string, construct bytes object
6780 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006781 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006782 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6783 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006785}
6786
6787PyObject *
6788PyUnicode_AsASCIIString(PyObject *unicode)
6789{
6790 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791}
6792
Victor Stinner99b95382011-07-04 14:23:54 +02006793#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006795/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006796
/* NEED_RETRY is defined when an int is narrower than a size_t.
   NOTE(review): presumably the code page codecs below use it to process
   inputs longer than INT_MAX in several passes -- confirm at the use
   sites. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6804
6805static char*
6806code_page_name(UINT code_page, PyObject **obj)
6807{
6808 *obj = NULL;
6809 if (code_page == CP_ACP)
6810 return "mbcs";
6811 if (code_page == CP_UTF7)
6812 return "CP_UTF7";
6813 if (code_page == CP_UTF8)
6814 return "CP_UTF8";
6815
6816 *obj = PyBytes_FromFormat("cp%u", code_page);
6817 if (*obj == NULL)
6818 return NULL;
6819 return PyBytes_AS_STRING(*obj);
6820}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006821
Victor Stinner3a50e702011-10-18 21:21:00 +02006822static DWORD
6823decode_code_page_flags(UINT code_page)
6824{
6825 if (code_page == CP_UTF7) {
6826 /* The CP_UTF7 decoder only supports flags=0 */
6827 return 0;
6828 }
6829 else
6830 return MB_ERR_INVALID_CHARS;
6831}
6832
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006833/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006834 * Decode a byte string from a Windows code page into unicode object in strict
6835 * mode.
6836 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006837 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6838 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006839 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006840static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006841decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006842 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006843 const char *in,
6844 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006845{
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006847 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006848 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006849
6850 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006851 assert(insize > 0);
6852 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6853 if (outsize <= 0)
6854 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006855
6856 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006858 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006859 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 if (*v == NULL)
6861 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006862 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006863 }
6864 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006866 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006867 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870 }
6871
6872 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006873 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6874 if (outsize <= 0)
6875 goto error;
6876 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006877
Victor Stinner3a50e702011-10-18 21:21:00 +02006878error:
6879 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6880 return -2;
6881 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006882 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006883}
6884
Victor Stinner3a50e702011-10-18 21:21:00 +02006885/*
6886 * Decode a byte string from a code page into unicode object with an error
6887 * handler.
6888 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006889 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006890 * UnicodeDecodeError exception and returns -1 on error.
6891 */
6892static int
6893decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006894 PyObject **v,
6895 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006896 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006897{
6898 const char *startin = in;
6899 const char *endin = in + size;
6900 const DWORD flags = decode_code_page_flags(code_page);
6901 /* Ideally, we should get reason from FormatMessage. This is the Windows
6902 2000 English version of the message. */
6903 const char *reason = "No mapping for the Unicode character exists "
6904 "in the target code page.";
6905 /* each step cannot decode more than 1 character, but a character can be
6906 represented as a surrogate pair */
6907 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006908 int insize;
6909 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006910 PyObject *errorHandler = NULL;
6911 PyObject *exc = NULL;
6912 PyObject *encoding_obj = NULL;
6913 char *encoding;
6914 DWORD err;
6915 int ret = -1;
6916
6917 assert(size > 0);
6918
6919 encoding = code_page_name(code_page, &encoding_obj);
6920 if (encoding == NULL)
6921 return -1;
6922
Victor Stinner7d00cc12014-03-17 23:08:06 +01006923 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006924 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6925 UnicodeDecodeError. */
6926 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6927 if (exc != NULL) {
6928 PyCodec_StrictErrors(exc);
6929 Py_CLEAR(exc);
6930 }
6931 goto error;
6932 }
6933
6934 if (*v == NULL) {
6935 /* Create unicode object */
6936 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6937 PyErr_NoMemory();
6938 goto error;
6939 }
Victor Stinnerab595942011-12-17 04:59:06 +01006940 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006941 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006942 if (*v == NULL)
6943 goto error;
6944 startout = PyUnicode_AS_UNICODE(*v);
6945 }
6946 else {
6947 /* Extend unicode object */
6948 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6949 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6950 PyErr_NoMemory();
6951 goto error;
6952 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006953 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006954 goto error;
6955 startout = PyUnicode_AS_UNICODE(*v) + n;
6956 }
6957
6958 /* Decode the byte string character per character */
6959 out = startout;
6960 while (in < endin)
6961 {
6962 /* Decode a character */
6963 insize = 1;
6964 do
6965 {
6966 outsize = MultiByteToWideChar(code_page, flags,
6967 in, insize,
6968 buffer, Py_ARRAY_LENGTH(buffer));
6969 if (outsize > 0)
6970 break;
6971 err = GetLastError();
6972 if (err != ERROR_NO_UNICODE_TRANSLATION
6973 && err != ERROR_INSUFFICIENT_BUFFER)
6974 {
6975 PyErr_SetFromWindowsErr(0);
6976 goto error;
6977 }
6978 insize++;
6979 }
6980 /* 4=maximum length of a UTF-8 sequence */
6981 while (insize <= 4 && (in + insize) <= endin);
6982
6983 if (outsize <= 0) {
6984 Py_ssize_t startinpos, endinpos, outpos;
6985
Victor Stinner7d00cc12014-03-17 23:08:06 +01006986 /* last character in partial decode? */
6987 if (in + insize >= endin && !final)
6988 break;
6989
Victor Stinner3a50e702011-10-18 21:21:00 +02006990 startinpos = in - startin;
6991 endinpos = startinpos + 1;
6992 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006993 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006994 errors, &errorHandler,
6995 encoding, reason,
6996 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006997 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006998 {
6999 goto error;
7000 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007001 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007002 }
7003 else {
7004 in += insize;
7005 memcpy(out, buffer, outsize * sizeof(wchar_t));
7006 out += outsize;
7007 }
7008 }
7009
7010 /* write a NUL character at the end */
7011 *out = 0;
7012
7013 /* Extend unicode object */
7014 outsize = out - startout;
7015 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007016 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 goto error;
Victor Stinner7d00cc12014-03-17 23:08:06 +01007018 ret = in - startin;
Victor Stinner3a50e702011-10-18 21:21:00 +02007019
7020error:
7021 Py_XDECREF(encoding_obj);
7022 Py_XDECREF(errorHandler);
7023 Py_XDECREF(exc);
7024 return ret;
7025}
7026
Victor Stinner3a50e702011-10-18 21:21:00 +02007027static PyObject *
7028decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007029 const char *s, Py_ssize_t size,
7030 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007031{
Victor Stinner76a31a62011-11-04 00:05:13 +01007032 PyObject *v = NULL;
7033 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007034
Victor Stinner3a50e702011-10-18 21:21:00 +02007035 if (code_page < 0) {
7036 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7037 return NULL;
7038 }
7039
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007042
Victor Stinner76a31a62011-11-04 00:05:13 +01007043 do
7044 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007046 if (size > INT_MAX) {
7047 chunk_size = INT_MAX;
7048 final = 0;
7049 done = 0;
7050 }
7051 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007052#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007053 {
7054 chunk_size = (int)size;
7055 final = (consumed == NULL);
7056 done = 1;
7057 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058
Victor Stinner76a31a62011-11-04 00:05:13 +01007059 if (chunk_size == 0 && done) {
7060 if (v != NULL)
7061 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007062 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007063 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064
Victor Stinner76a31a62011-11-04 00:05:13 +01007065 converted = decode_code_page_strict(code_page, &v,
7066 s, chunk_size);
7067 if (converted == -2)
7068 converted = decode_code_page_errors(code_page, &v,
7069 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007070 errors, final);
7071 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007072
7073 if (converted < 0) {
7074 Py_XDECREF(v);
7075 return NULL;
7076 }
7077
7078 if (consumed)
7079 *consumed += converted;
7080
7081 s += converted;
7082 size -= converted;
7083 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007084
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007085 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086}
7087
Alexander Belopolsky40018472011-02-26 01:02:56 +00007088PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007089PyUnicode_DecodeCodePageStateful(int code_page,
7090 const char *s,
7091 Py_ssize_t size,
7092 const char *errors,
7093 Py_ssize_t *consumed)
7094{
7095 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7096}
7097
7098PyObject *
7099PyUnicode_DecodeMBCSStateful(const char *s,
7100 Py_ssize_t size,
7101 const char *errors,
7102 Py_ssize_t *consumed)
7103{
7104 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7105}
7106
7107PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007108PyUnicode_DecodeMBCS(const char *s,
7109 Py_ssize_t size,
7110 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007111{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7113}
7114
Victor Stinner3a50e702011-10-18 21:21:00 +02007115static DWORD
7116encode_code_page_flags(UINT code_page, const char *errors)
7117{
7118 if (code_page == CP_UTF8) {
7119 if (winver.dwMajorVersion >= 6)
7120 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7121 and later */
7122 return WC_ERR_INVALID_CHARS;
7123 else
7124 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7125 return 0;
7126 }
7127 else if (code_page == CP_UTF7) {
7128 /* CP_UTF7 only supports flags=0 */
7129 return 0;
7130 }
7131 else {
7132 if (errors != NULL && strcmp(errors, "replace") == 0)
7133 return 0;
7134 else
7135 return WC_NO_BEST_FIT_CHARS;
7136 }
7137}
7138
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 * Encode a Unicode string to a Windows code page into a byte string in strict
7141 * mode.
7142 *
7143 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007144 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007145 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007146static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007147encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007148 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007149 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150{
Victor Stinner554f3f02010-06-16 23:33:54 +00007151 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007152 BOOL *pusedDefaultChar = &usedDefaultChar;
7153 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007154 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007155 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007156 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 const DWORD flags = encode_code_page_flags(code_page, NULL);
7158 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007159 /* Create a substring so that we can get the UTF-16 representation
7160 of just the slice under consideration. */
7161 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007162
Martin v. Löwis3d325192011-11-04 18:23:06 +01007163 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007164
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007166 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007168 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007169
Victor Stinner2fc507f2011-11-04 20:06:39 +01007170 substring = PyUnicode_Substring(unicode, offset, offset+len);
7171 if (substring == NULL)
7172 return -1;
7173 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7174 if (p == NULL) {
7175 Py_DECREF(substring);
7176 return -1;
7177 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007178 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007179
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007180 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007182 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 NULL, 0,
7184 NULL, pusedDefaultChar);
7185 if (outsize <= 0)
7186 goto error;
7187 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007188 if (pusedDefaultChar && *pusedDefaultChar) {
7189 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007191 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007192
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007196 if (*outbytes == NULL) {
7197 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007199 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201 }
7202 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 const Py_ssize_t n = PyBytes_Size(*outbytes);
7205 if (outsize > PY_SSIZE_T_MAX - n) {
7206 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007207 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007210 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7211 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007213 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007215 }
7216
7217 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007219 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 out, outsize,
7221 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007222 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 if (outsize <= 0)
7224 goto error;
7225 if (pusedDefaultChar && *pusedDefaultChar)
7226 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007227 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007228
Victor Stinner3a50e702011-10-18 21:21:00 +02007229error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007231 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7232 return -2;
7233 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007234 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007235}
7236
Victor Stinner3a50e702011-10-18 21:21:00 +02007237/*
7238 * Encode a Unicode string to a Windows code page into a byte string using a
7239 * error handler.
7240 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007241 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 * -1 on other error.
7243 */
7244static int
7245encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007246 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007247 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007248{
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007250 Py_ssize_t pos = unicode_offset;
7251 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 /* Ideally, we should get reason from FormatMessage. This is the Windows
7253 2000 English version of the message. */
7254 const char *reason = "invalid character";
7255 /* 4=maximum length of a UTF-8 sequence */
7256 char buffer[4];
7257 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7258 Py_ssize_t outsize;
7259 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 PyObject *errorHandler = NULL;
7261 PyObject *exc = NULL;
7262 PyObject *encoding_obj = NULL;
7263 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007264 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 PyObject *rep;
7266 int ret = -1;
7267
7268 assert(insize > 0);
7269
7270 encoding = code_page_name(code_page, &encoding_obj);
7271 if (encoding == NULL)
7272 return -1;
7273
7274 if (errors == NULL || strcmp(errors, "strict") == 0) {
7275 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7276 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007277 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 if (exc != NULL) {
7279 PyCodec_StrictErrors(exc);
7280 Py_DECREF(exc);
7281 }
7282 Py_XDECREF(encoding_obj);
7283 return -1;
7284 }
7285
7286 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7287 pusedDefaultChar = &usedDefaultChar;
7288 else
7289 pusedDefaultChar = NULL;
7290
7291 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7292 PyErr_NoMemory();
7293 goto error;
7294 }
7295 outsize = insize * Py_ARRAY_LENGTH(buffer);
7296
7297 if (*outbytes == NULL) {
7298 /* Create string object */
7299 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7300 if (*outbytes == NULL)
7301 goto error;
7302 out = PyBytes_AS_STRING(*outbytes);
7303 }
7304 else {
7305 /* Extend string object */
7306 Py_ssize_t n = PyBytes_Size(*outbytes);
7307 if (n > PY_SSIZE_T_MAX - outsize) {
7308 PyErr_NoMemory();
7309 goto error;
7310 }
7311 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7312 goto error;
7313 out = PyBytes_AS_STRING(*outbytes) + n;
7314 }
7315
7316 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007317 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007319 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7320 wchar_t chars[2];
7321 int charsize;
7322 if (ch < 0x10000) {
7323 chars[0] = (wchar_t)ch;
7324 charsize = 1;
7325 }
7326 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007327 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7328 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007329 charsize = 2;
7330 }
7331
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007333 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007334 buffer, Py_ARRAY_LENGTH(buffer),
7335 NULL, pusedDefaultChar);
7336 if (outsize > 0) {
7337 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7338 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007339 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 memcpy(out, buffer, outsize);
7341 out += outsize;
7342 continue;
7343 }
7344 }
7345 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7346 PyErr_SetFromWindowsErr(0);
7347 goto error;
7348 }
7349
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 rep = unicode_encode_call_errorhandler(
7351 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007352 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007353 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 if (rep == NULL)
7355 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007356 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007357
7358 if (PyBytes_Check(rep)) {
7359 outsize = PyBytes_GET_SIZE(rep);
7360 if (outsize != 1) {
7361 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7362 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7363 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7364 Py_DECREF(rep);
7365 goto error;
7366 }
7367 out = PyBytes_AS_STRING(*outbytes) + offset;
7368 }
7369 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7370 out += outsize;
7371 }
7372 else {
7373 Py_ssize_t i;
7374 enum PyUnicode_Kind kind;
7375 void *data;
7376
Benjamin Petersonbac79492012-01-14 13:34:47 -05007377 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007378 Py_DECREF(rep);
7379 goto error;
7380 }
7381
7382 outsize = PyUnicode_GET_LENGTH(rep);
7383 if (outsize != 1) {
7384 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7385 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7386 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7387 Py_DECREF(rep);
7388 goto error;
7389 }
7390 out = PyBytes_AS_STRING(*outbytes) + offset;
7391 }
7392 kind = PyUnicode_KIND(rep);
7393 data = PyUnicode_DATA(rep);
7394 for (i=0; i < outsize; i++) {
7395 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7396 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007397 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007398 encoding, unicode,
7399 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 "unable to encode error handler result to ASCII");
7401 Py_DECREF(rep);
7402 goto error;
7403 }
7404 *out = (unsigned char)ch;
7405 out++;
7406 }
7407 }
7408 Py_DECREF(rep);
7409 }
7410 /* write a NUL byte */
7411 *out = 0;
7412 outsize = out - PyBytes_AS_STRING(*outbytes);
7413 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7414 if (_PyBytes_Resize(outbytes, outsize) < 0)
7415 goto error;
7416 ret = 0;
7417
7418error:
7419 Py_XDECREF(encoding_obj);
7420 Py_XDECREF(errorHandler);
7421 Py_XDECREF(exc);
7422 return ret;
7423}
7424
Victor Stinner3a50e702011-10-18 21:21:00 +02007425static PyObject *
7426encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007427 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 const char *errors)
7429{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007430 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007432 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007433 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007434
Benjamin Petersonbac79492012-01-14 13:34:47 -05007435 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007436 return NULL;
7437 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007438
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 if (code_page < 0) {
7440 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7441 return NULL;
7442 }
7443
Martin v. Löwis3d325192011-11-04 18:23:06 +01007444 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007445 return PyBytes_FromStringAndSize(NULL, 0);
7446
Victor Stinner7581cef2011-11-03 22:32:33 +01007447 offset = 0;
7448 do
7449 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007450#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007451 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007452 chunks. */
7453 if (len > INT_MAX/2) {
7454 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 done = 0;
7456 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007457 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007459 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007460 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007461 done = 1;
7462 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463
Victor Stinner76a31a62011-11-04 00:05:13 +01007464 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007465 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007466 errors);
7467 if (ret == -2)
7468 ret = encode_code_page_errors(code_page, &outbytes,
7469 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007470 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007471 if (ret < 0) {
7472 Py_XDECREF(outbytes);
7473 return NULL;
7474 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475
Victor Stinner7581cef2011-11-03 22:32:33 +01007476 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007477 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007478 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 return outbytes;
7481}
7482
7483PyObject *
7484PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7485 Py_ssize_t size,
7486 const char *errors)
7487{
Victor Stinner7581cef2011-11-03 22:32:33 +01007488 PyObject *unicode, *res;
7489 unicode = PyUnicode_FromUnicode(p, size);
7490 if (unicode == NULL)
7491 return NULL;
7492 res = encode_code_page(CP_ACP, unicode, errors);
7493 Py_DECREF(unicode);
7494 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007495}
7496
7497PyObject *
7498PyUnicode_EncodeCodePage(int code_page,
7499 PyObject *unicode,
7500 const char *errors)
7501{
Victor Stinner7581cef2011-11-03 22:32:33 +01007502 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007503}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007504
Alexander Belopolsky40018472011-02-26 01:02:56 +00007505PyObject *
7506PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007507{
7508 if (!PyUnicode_Check(unicode)) {
7509 PyErr_BadArgument();
7510 return NULL;
7511 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007512 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007513}
7514
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007515#undef NEED_RETRY
7516
Victor Stinner99b95382011-07-04 14:23:54 +02007517#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007518
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519/* --- Character Mapping Codec -------------------------------------------- */
7520
/* Decode the bytes s[0:size] through a str mapping and append the result to
   writer.

   Each input byte is used as an index into mapping.  A byte that is not
   covered by the mapping (index >= length), or that maps to U+FFFE, is
   undefined and is passed to the error handler selected by errors.

   Returns 0 on success, -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        /* Every byte has an entry and every entry is a Py_UCS1, so no value
           can be U+FFFE and the error handler is never needed here.
           NOTE(review): the loop stores into the writer without a per-character
           space check; it relies on the caller having prepared room for size
           characters (PyUnicode_DecodeCharmap() does) -- confirm for any
           other caller. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Ask the writer to accept characters up to 0xff, then
                   reload the cached fields it may have changed. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a 2-byte mapping covering all byte values.
               They run until the input is exhausted or until a character
               needs the general handling below, which is entered through
               "goto Error" with x already loaded. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Does not fit the current output (this also catches
                       U+FFFE, since a 1-byte writer cannot hold it). */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* General path: one character per iteration. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Reached either by falling through or from the tight loops above;
           in both cases x holds the mapped value for the byte at s. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed to the handler by address and is not advanced
               here. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7639
/* Decode the bytes s[0:size] through an arbitrary mapping object and append
   the result to writer.

   Each input byte is converted to an int key and looked up with
   PyObject_GetItem().  The looked-up item may be an integer code point, a
   str (of any length) or None.  A LookupError, None, or the value 0xFFFE
   means the byte is undefined; it is then passed to the error handler
   selected by errors.

   Returns 0 on success, -1 on error. */
static int
charmap_decode_mapping(const char *s,
                       Py_ssize_t size,
                       PyObject *mapping,
                       const char *errors,
                       _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    unsigned char ch;
    /* item holds the reference returned by the current lookup; it is NULL
       whenever no reference is owned, so onError can release it blindly. */
    PyObject *key, *item = NULL;

    e = s + size;

    while (s < e) {
        ch = *s;

        /* Get mapping (char ordinal -> integer, Unicode char or None) */
        key = PyLong_FromLong((long)ch);
        if (key == NULL)
            goto onError;

        item = PyObject_GetItem(mapping, key);
        Py_DECREF(key);
        if (item == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found means: mapping is undefined. */
                PyErr_Clear();
                goto Undefined;
            } else
                goto onError;
        }

        /* Apply mapping */
        if (item == Py_None)
            goto Undefined;
        if (PyLong_Check(item)) {
            long value = PyLong_AS_LONG(item);
            if (value == 0xFFFE)
                goto Undefined;
            if (value < 0 || value > MAX_UNICODE) {
                PyErr_Format(PyExc_TypeError,
                             "character mapping must be in range(0x%lx)",
                             (unsigned long)MAX_UNICODE + 1);
                goto onError;
            }

            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                goto onError;
        }
        else if (PyUnicode_Check(item)) {
            if (PyUnicode_READY(item) == -1)
                goto onError;
            if (PyUnicode_GET_LENGTH(item) == 1) {
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
                if (value == 0xFFFE)
                    goto Undefined;
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                    goto onError;
            }
            else {
                /* The item is not a single character, so the output length
                   no longer matches the input length: turn on the writer's
                   overallocation before appending the string. */
                writer->overallocate = 1;
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
                    goto onError;
            }
        }
        else {
            /* wrong return value */
            PyErr_SetString(PyExc_TypeError,
                            "character mapping must return integer, None or str");
            goto onError;
        }
        Py_CLEAR(item);
        ++s;
        continue;

Undefined:
        /* undefined mapping */
        Py_CLEAR(item);
        startinpos = s-starts;
        endinpos = startinpos+1;
        /* s is passed to the handler by address and is not advanced here. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "charmap", "character maps to <undefined>",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                writer)) {
            goto onError;
        }
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(item);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7741
Alexander Belopolsky40018472011-02-26 01:02:56 +00007742PyObject *
7743PyUnicode_DecodeCharmap(const char *s,
7744 Py_ssize_t size,
7745 PyObject *mapping,
7746 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007748 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007749
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 /* Default to Latin-1 */
7751 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007755 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007756 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007757 writer.min_length = size;
7758 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007760
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007761 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007762 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7763 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007764 }
7765 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007766 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7767 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007769 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007770
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007772 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 return NULL;
7774}
7775
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007776/* Charmap encoding: the lookup table */
7777
Alexander Belopolsky40018472011-02-26 01:02:56 +00007778struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 PyObject_HEAD
7780 unsigned char level1[32];
7781 int count2, count3;
7782 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783};
7784
7785static PyObject*
7786encoding_map_size(PyObject *obj, PyObject* args)
7787{
7788 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007789 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007791}
7792
7793static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007794 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 PyDoc_STR("Return the size (in bytes) of this object") },
7796 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007797};
7798
7799static void
7800encoding_map_dealloc(PyObject* o)
7801{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007802 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007803}
7804
7805static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007806 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 "EncodingMap", /*tp_name*/
7808 sizeof(struct encoding_map), /*tp_basicsize*/
7809 0, /*tp_itemsize*/
7810 /* methods */
7811 encoding_map_dealloc, /*tp_dealloc*/
7812 0, /*tp_print*/
7813 0, /*tp_getattr*/
7814 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007815 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 0, /*tp_repr*/
7817 0, /*tp_as_number*/
7818 0, /*tp_as_sequence*/
7819 0, /*tp_as_mapping*/
7820 0, /*tp_hash*/
7821 0, /*tp_call*/
7822 0, /*tp_str*/
7823 0, /*tp_getattro*/
7824 0, /*tp_setattro*/
7825 0, /*tp_as_buffer*/
7826 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7827 0, /*tp_doc*/
7828 0, /*tp_traverse*/
7829 0, /*tp_clear*/
7830 0, /*tp_richcompare*/
7831 0, /*tp_weaklistoffset*/
7832 0, /*tp_iter*/
7833 0, /*tp_iternext*/
7834 encoding_map_methods, /*tp_methods*/
7835 0, /*tp_members*/
7836 0, /*tp_getset*/
7837 0, /*tp_base*/
7838 0, /*tp_dict*/
7839 0, /*tp_descr_get*/
7840 0, /*tp_descr_set*/
7841 0, /*tp_dictoffset*/
7842 0, /*tp_init*/
7843 0, /*tp_alloc*/
7844 0, /*tp_new*/
7845 0, /*tp_free*/
7846 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007847};
7848
7849PyObject*
7850PyUnicode_BuildEncodingMap(PyObject* string)
7851{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007852 PyObject *result;
7853 struct encoding_map *mresult;
7854 int i;
7855 int need_dict = 0;
7856 unsigned char level1[32];
7857 unsigned char level2[512];
7858 unsigned char *mlevel1, *mlevel2, *mlevel3;
7859 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 int kind;
7861 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007862 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007864
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007865 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007866 PyErr_BadArgument();
7867 return NULL;
7868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007869 kind = PyUnicode_KIND(string);
7870 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007871 length = PyUnicode_GET_LENGTH(string);
7872 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873 memset(level1, 0xFF, sizeof level1);
7874 memset(level2, 0xFF, sizeof level2);
7875
7876 /* If there isn't a one-to-one mapping of NULL to \0,
7877 or if there are non-BMP characters, we need to use
7878 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007881 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007882 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007883 ch = PyUnicode_READ(kind, data, i);
7884 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885 need_dict = 1;
7886 break;
7887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007888 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889 /* unmapped character */
7890 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007891 l1 = ch >> 11;
7892 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 if (level1[l1] == 0xFF)
7894 level1[l1] = count2++;
7895 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007896 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007897 }
7898
7899 if (count2 >= 0xFF || count3 >= 0xFF)
7900 need_dict = 1;
7901
7902 if (need_dict) {
7903 PyObject *result = PyDict_New();
7904 PyObject *key, *value;
7905 if (!result)
7906 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007907 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007909 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007910 if (!key || !value)
7911 goto failed1;
7912 if (PyDict_SetItem(result, key, value) == -1)
7913 goto failed1;
7914 Py_DECREF(key);
7915 Py_DECREF(value);
7916 }
7917 return result;
7918 failed1:
7919 Py_XDECREF(key);
7920 Py_XDECREF(value);
7921 Py_DECREF(result);
7922 return NULL;
7923 }
7924
7925 /* Create a three-level trie */
7926 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7927 16*count2 + 128*count3 - 1);
7928 if (!result)
7929 return PyErr_NoMemory();
7930 PyObject_Init(result, &EncodingMapType);
7931 mresult = (struct encoding_map*)result;
7932 mresult->count2 = count2;
7933 mresult->count3 = count3;
7934 mlevel1 = mresult->level1;
7935 mlevel2 = mresult->level23;
7936 mlevel3 = mresult->level23 + 16*count2;
7937 memcpy(mlevel1, level1, 32);
7938 memset(mlevel2, 0xFF, 16*count2);
7939 memset(mlevel3, 0, 128*count3);
7940 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007941 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007942 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007943 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7944 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945 /* unmapped character */
7946 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007947 o1 = ch>>11;
7948 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 i2 = 16*mlevel1[o1] + o2;
7950 if (mlevel2[i2] == 0xFF)
7951 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007952 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007953 i3 = 128*mlevel2[i2] + o3;
7954 mlevel3[i3] = i;
7955 }
7956 return result;
7957}
7958
7959static int
Victor Stinner22168992011-11-20 17:09:18 +01007960encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007961{
7962 struct encoding_map *map = (struct encoding_map*)mapping;
7963 int l1 = c>>11;
7964 int l2 = (c>>7) & 0xF;
7965 int l3 = c & 0x7F;
7966 int i;
7967
Victor Stinner22168992011-11-20 17:09:18 +01007968 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007970 if (c == 0)
7971 return 0;
7972 /* level 1*/
7973 i = map->level1[l1];
7974 if (i == 0xFF) {
7975 return -1;
7976 }
7977 /* level 2*/
7978 i = map->level23[16*i+l2];
7979 if (i == 0xFF) {
7980 return -1;
7981 }
7982 /* level 3 */
7983 i = map->level23[16*map->count2 + 128*i + l3];
7984 if (i == 0) {
7985 return -1;
7986 }
7987 return i;
7988}
7989
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007990/* Lookup the character ch in the mapping. If the character
7991 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007992 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007993static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007994charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995{
Christian Heimes217cfd12007-12-02 14:31:20 +00007996 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007997 PyObject *x;
7998
7999 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001 x = PyObject_GetItem(mapping, w);
8002 Py_DECREF(w);
8003 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8005 /* No mapping found means: mapping is undefined. */
8006 PyErr_Clear();
8007 x = Py_None;
8008 Py_INCREF(x);
8009 return x;
8010 } else
8011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008013 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008015 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 long value = PyLong_AS_LONG(x);
8017 if (value < 0 || value > 255) {
8018 PyErr_SetString(PyExc_TypeError,
8019 "character mapping must be in range(256)");
8020 Py_DECREF(x);
8021 return NULL;
8022 }
8023 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008025 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 /* wrong return value */
8029 PyErr_Format(PyExc_TypeError,
8030 "character mapping must return integer, bytes or None, not %.400s",
8031 x->ob_type->tp_name);
8032 Py_DECREF(x);
8033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034 }
8035}
8036
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008038charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008040 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8041 /* exponentially overallocate to minimize reallocations */
8042 if (requiredsize < 2*outsize)
8043 requiredsize = 2*outsize;
8044 if (_PyBytes_Resize(outobj, requiredsize))
8045 return -1;
8046 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047}
8048
/* Outcome of trying to encode one character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008053 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008054 space is available. Return a new reference to the object that
8055 was put in the output buffer, or Py_None, if the mapping was undefined
8056 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008057 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008058static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008059charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008060 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008061{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062 PyObject *rep;
8063 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008064 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008065
Christian Heimes90aa7642007-12-19 02:45:37 +00008066 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008069 if (res == -1)
8070 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 if (outsize<requiredsize)
8072 if (charmapencode_resize(outobj, outpos, requiredsize))
8073 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008074 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 outstart[(*outpos)++] = (char)res;
8076 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008077 }
8078
8079 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008082 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 Py_DECREF(rep);
8084 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008085 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 if (PyLong_Check(rep)) {
8087 Py_ssize_t requiredsize = *outpos+1;
8088 if (outsize<requiredsize)
8089 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8090 Py_DECREF(rep);
8091 return enc_EXCEPTION;
8092 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008093 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008095 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 else {
8097 const char *repchars = PyBytes_AS_STRING(rep);
8098 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8099 Py_ssize_t requiredsize = *outpos+repsize;
8100 if (outsize<requiredsize)
8101 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8102 Py_DECREF(rep);
8103 return enc_EXCEPTION;
8104 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008105 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 memcpy(outstart + *outpos, repchars, repsize);
8107 *outpos += repsize;
8108 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 Py_DECREF(rep);
8111 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112}
8113
8114/* handle an error in PyUnicode_EncodeCharmap
8115 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008116static int
8117charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008118 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008120 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008121 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122{
8123 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008124 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008125 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008126 enum PyUnicode_Kind kind;
8127 void *data;
8128 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008130 Py_ssize_t collstartpos = *inpos;
8131 Py_ssize_t collendpos = *inpos+1;
8132 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 char *encoding = "charmap";
8134 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008136 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008137 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008138
Benjamin Petersonbac79492012-01-14 13:34:47 -05008139 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008140 return -1;
8141 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142 /* find all unencodable characters */
8143 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008145 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008146 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008147 val = encoding_map_lookup(ch, mapping);
8148 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 break;
8150 ++collendpos;
8151 continue;
8152 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008153
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008154 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8155 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 if (rep==NULL)
8157 return -1;
8158 else if (rep!=Py_None) {
8159 Py_DECREF(rep);
8160 break;
8161 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008162 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008164 }
8165 /* cache callback name lookup
8166 * (if not done yet, i.e. it's the first error) */
8167 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 if ((errors==NULL) || (!strcmp(errors, "strict")))
8169 *known_errorHandler = 1;
8170 else if (!strcmp(errors, "replace"))
8171 *known_errorHandler = 2;
8172 else if (!strcmp(errors, "ignore"))
8173 *known_errorHandler = 3;
8174 else if (!strcmp(errors, "xmlcharrefreplace"))
8175 *known_errorHandler = 4;
8176 else
8177 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008178 }
8179 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008181 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182 return -1;
8183 case 2: /* replace */
8184 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 x = charmapencode_output('?', mapping, res, respos);
8186 if (x==enc_EXCEPTION) {
8187 return -1;
8188 }
8189 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008190 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 return -1;
8192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 }
8194 /* fall through */
8195 case 3: /* ignore */
8196 *inpos = collendpos;
8197 break;
8198 case 4: /* xmlcharrefreplace */
8199 /* generate replacement (temporarily (mis)uses p) */
8200 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 char buffer[2+29+1+1];
8202 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008203 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 for (cp = buffer; *cp; ++cp) {
8205 x = charmapencode_output(*cp, mapping, res, respos);
8206 if (x==enc_EXCEPTION)
8207 return -1;
8208 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008209 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return -1;
8211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 }
8213 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 *inpos = collendpos;
8215 break;
8216 default:
8217 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008218 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008220 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008222 if (PyBytes_Check(repunicode)) {
8223 /* Directly copy bytes result to output. */
8224 Py_ssize_t outsize = PyBytes_Size(*res);
8225 Py_ssize_t requiredsize;
8226 repsize = PyBytes_Size(repunicode);
8227 requiredsize = *respos + repsize;
8228 if (requiredsize > outsize)
8229 /* Make room for all additional bytes. */
8230 if (charmapencode_resize(res, respos, requiredsize)) {
8231 Py_DECREF(repunicode);
8232 return -1;
8233 }
8234 memcpy(PyBytes_AsString(*res) + *respos,
8235 PyBytes_AsString(repunicode), repsize);
8236 *respos += repsize;
8237 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008238 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008239 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008242 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008243 Py_DECREF(repunicode);
8244 return -1;
8245 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008246 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008247 data = PyUnicode_DATA(repunicode);
8248 kind = PyUnicode_KIND(repunicode);
8249 for (index = 0; index < repsize; index++) {
8250 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8251 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008253 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 return -1;
8255 }
8256 else if (x==enc_FAILED) {
8257 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008258 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 return -1;
8260 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 }
8262 *inpos = newpos;
8263 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 }
8265 return 0;
8266}
8267
Alexander Belopolsky40018472011-02-26 01:02:56 +00008268PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008269_PyUnicode_EncodeCharmap(PyObject *unicode,
8270 PyObject *mapping,
8271 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 /* output object */
8274 PyObject *res = NULL;
8275 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008276 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008277 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008279 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 PyObject *errorHandler = NULL;
8281 PyObject *exc = NULL;
8282 /* the following variable is used for caching string comparisons
8283 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8284 * 3=ignore, 4=xmlcharrefreplace */
8285 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008286 void *data;
8287 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288
Benjamin Petersonbac79492012-01-14 13:34:47 -05008289 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008290 return NULL;
8291 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008292 data = PyUnicode_DATA(unicode);
8293 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008294
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 /* Default to Latin-1 */
8296 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008297 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 /* allocate enough for a simple encoding without
8300 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008301 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 if (res == NULL)
8303 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008304 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008308 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008310 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 if (x==enc_EXCEPTION) /* error */
8312 goto onError;
8313 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008314 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 &exc,
8316 &known_errorHandler, &errorHandler, errors,
8317 &res, &respos)) {
8318 goto onError;
8319 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008320 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 else
8322 /* done with this character => adjust input position */
8323 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008327 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008328 if (_PyBytes_Resize(&res, respos) < 0)
8329 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008330
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 Py_XDECREF(exc);
8332 Py_XDECREF(errorHandler);
8333 return res;
8334
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336 Py_XDECREF(res);
8337 Py_XDECREF(exc);
8338 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339 return NULL;
8340}
8341
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008342/* Deprecated */
8343PyObject *
8344PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8345 Py_ssize_t size,
8346 PyObject *mapping,
8347 const char *errors)
8348{
8349 PyObject *result;
8350 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8351 if (unicode == NULL)
8352 return NULL;
8353 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8354 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008355 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356}
8357
Alexander Belopolsky40018472011-02-26 01:02:56 +00008358PyObject *
8359PyUnicode_AsCharmapString(PyObject *unicode,
8360 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361{
8362 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 PyErr_BadArgument();
8364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008366 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367}
8368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370static void
8371make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008373 Py_ssize_t startpos, Py_ssize_t endpos,
8374 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 *exceptionObject = _PyUnicodeTranslateError_Create(
8378 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 }
8380 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8382 goto onError;
8383 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8384 goto onError;
8385 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8386 goto onError;
8387 return;
8388 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008389 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 }
8391}
8392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393/* error handling callback helper:
8394 build arguments, call the callback and check the arguments,
8395 put the result into newpos and return the replacement string, which
8396 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008397static PyObject *
8398unicode_translate_call_errorhandler(const char *errors,
8399 PyObject **errorHandler,
8400 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008402 Py_ssize_t startpos, Py_ssize_t endpos,
8403 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008405 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008407 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 PyObject *restuple;
8409 PyObject *resunicode;
8410
8411 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 }
8416
8417 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421
8422 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008427 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 Py_DECREF(restuple);
8429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 }
8431 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 &resunicode, &i_newpos)) {
8433 Py_DECREF(restuple);
8434 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008438 else
8439 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008441 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 Py_DECREF(restuple);
8443 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445 Py_INCREF(resunicode);
8446 Py_DECREF(restuple);
8447 return resunicode;
8448}
8449
8450/* Lookup the character ch in the mapping and put the result in result,
8451 which must be decrefed by the caller.
8452 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008453static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455{
Christian Heimes217cfd12007-12-02 14:31:20 +00008456 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 PyObject *x;
8458
8459 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 x = PyObject_GetItem(mapping, w);
8462 Py_DECREF(w);
8463 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8465 /* No mapping found means: use 1:1 mapping. */
8466 PyErr_Clear();
8467 *result = NULL;
8468 return 0;
8469 } else
8470 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471 }
8472 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 *result = x;
8474 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008476 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008478 if (value < 0 || value > MAX_UNICODE) {
8479 PyErr_Format(PyExc_ValueError,
8480 "character mapping must be in range(0x%x)",
8481 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 Py_DECREF(x);
8483 return -1;
8484 }
8485 *result = x;
8486 return 0;
8487 }
8488 else if (PyUnicode_Check(x)) {
8489 *result = x;
8490 return 0;
8491 }
8492 else {
8493 /* wrong return value */
8494 PyErr_SetString(PyExc_TypeError,
8495 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008496 Py_DECREF(x);
8497 return -1;
8498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499}
Victor Stinner1194ea02014-04-04 19:37:40 +02008500
8501/* lookup the character, write the result into the writer.
8502 Return 1 if the result was written into the writer, return 0 if the mapping
8503 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008504static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008505charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8506 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507{
Victor Stinner1194ea02014-04-04 19:37:40 +02008508 PyObject *item;
8509
8510 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008512
8513 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008515 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008518 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008520
8521 if (item == Py_None) {
8522 Py_DECREF(item);
8523 return 0;
8524 }
8525
8526 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008527 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8528 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8529 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008530 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8531 Py_DECREF(item);
8532 return -1;
8533 }
8534 Py_DECREF(item);
8535 return 1;
8536 }
8537
8538 if (!PyUnicode_Check(item)) {
8539 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008541 }
8542
8543 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8544 Py_DECREF(item);
8545 return -1;
8546 }
8547
8548 Py_DECREF(item);
8549 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550}
8551
Victor Stinner89a76ab2014-04-05 11:44:04 +02008552static int
8553unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8554 Py_UCS1 *translate)
8555{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008556 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008557 int ret = 0;
8558
Victor Stinner89a76ab2014-04-05 11:44:04 +02008559 if (charmaptranslate_lookup(ch, mapping, &item)) {
8560 return -1;
8561 }
8562
8563 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008564 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008565 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008566 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008567 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008568 /* not found => default to 1:1 mapping */
8569 translate[ch] = ch;
8570 return 1;
8571 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008572 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008573 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008574 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8575 used it */
8576 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008577 /* invalid character or character outside ASCII:
8578 skip the fast translate */
8579 goto exit;
8580 }
8581 translate[ch] = (Py_UCS1)replace;
8582 }
8583 else if (PyUnicode_Check(item)) {
8584 Py_UCS4 replace;
8585
8586 if (PyUnicode_READY(item) == -1) {
8587 Py_DECREF(item);
8588 return -1;
8589 }
8590 if (PyUnicode_GET_LENGTH(item) != 1)
8591 goto exit;
8592
8593 replace = PyUnicode_READ_CHAR(item, 0);
8594 if (replace > 127)
8595 goto exit;
8596 translate[ch] = (Py_UCS1)replace;
8597 }
8598 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008599 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008600 goto exit;
8601 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008602 ret = 1;
8603
Benjamin Peterson1365de72014-04-07 20:15:41 -04008604 exit:
8605 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008606 return ret;
8607}
8608
8609/* Fast path for ascii => ascii translation. Return 1 if the whole string
8610 was translated into writer, return 0 if the input string was partially
8611 translated into writer, raise an exception and return -1 on error. */
8612static int
8613unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008614 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008615{
Victor Stinner872b2912014-04-05 14:27:07 +02008616 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008617 Py_ssize_t len;
8618 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008619 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008620
8621 if (PyUnicode_READY(input) == -1)
8622 return -1;
8623 if (!PyUnicode_IS_ASCII(input))
8624 return 0;
8625 len = PyUnicode_GET_LENGTH(input);
8626
Victor Stinner872b2912014-04-05 14:27:07 +02008627 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008628
8629 in = PyUnicode_1BYTE_DATA(input);
8630 end = in + len;
8631
8632 assert(PyUnicode_IS_ASCII(writer->buffer));
8633 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8634 out = PyUnicode_1BYTE_DATA(writer->buffer);
8635
Victor Stinner872b2912014-04-05 14:27:07 +02008636 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008637 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008638 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008639 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008640 int translate = unicode_fast_translate_lookup(mapping, ch,
8641 ascii_table);
8642 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008643 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008644 if (translate == 0)
8645 goto exit;
8646 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008647 }
Victor Stinner872b2912014-04-05 14:27:07 +02008648 if (ch2 == 0xfe) {
8649 if (ignore)
8650 continue;
8651 goto exit;
8652 }
8653 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008654 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008655 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008656 }
Victor Stinner872b2912014-04-05 14:27:07 +02008657 res = 1;
8658
8659exit:
8660 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8661 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008662}
8663
Alexander Belopolsky40018472011-02-26 01:02:56 +00008664PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665_PyUnicode_TranslateCharmap(PyObject *input,
8666 PyObject *mapping,
8667 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008670 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 Py_ssize_t size, i;
8672 int kind;
8673 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008674 _PyUnicodeWriter writer;
8675 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 char *reason = "character maps to <undefined>";
8677 PyObject *errorHandler = NULL;
8678 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008679 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008680 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 PyErr_BadArgument();
8684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 if (PyUnicode_READY(input) == -1)
8688 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008689 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 kind = PyUnicode_KIND(input);
8691 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692
8693 if (size == 0) {
8694 Py_INCREF(input);
8695 return input;
8696 }
8697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 /* allocate enough for a simple 1:1 translation without
8699 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008700 _PyUnicodeWriter_Init(&writer);
8701 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703
Victor Stinner872b2912014-04-05 14:27:07 +02008704 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8705
8706 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008707 if (res < 0) {
8708 _PyUnicodeWriter_Dealloc(&writer);
8709 return NULL;
8710 }
8711 if (res == 1)
8712 return _PyUnicodeWriter_Finish(&writer);
8713
Victor Stinner89a76ab2014-04-05 11:44:04 +02008714 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008717 int translate;
8718 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8719 Py_ssize_t newpos;
8720 /* startpos for collecting untranslatable chars */
8721 Py_ssize_t collstart;
8722 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008723 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724
Victor Stinner1194ea02014-04-04 19:37:40 +02008725 ch = PyUnicode_READ(kind, data, i);
8726 translate = charmaptranslate_output(ch, mapping, &writer);
8727 if (translate < 0)
8728 goto onError;
8729
8730 if (translate != 0) {
8731 /* it worked => adjust input pointer */
8732 ++i;
8733 continue;
8734 }
8735
8736 /* untranslatable character */
8737 collstart = i;
8738 collend = i+1;
8739
8740 /* find all untranslatable characters */
8741 while (collend < size) {
8742 PyObject *x;
8743 ch = PyUnicode_READ(kind, data, collend);
8744 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008745 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008746 Py_XDECREF(x);
8747 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008749 ++collend;
8750 }
8751
8752 if (ignore) {
8753 i = collend;
8754 }
8755 else {
8756 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8757 reason, input, &exc,
8758 collstart, collend, &newpos);
8759 if (repunicode == NULL)
8760 goto onError;
8761 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008763 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008764 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008765 Py_DECREF(repunicode);
8766 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008767 }
8768 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769 Py_XDECREF(exc);
8770 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008771 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 Py_XDECREF(exc);
8776 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 return NULL;
8778}
8779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780/* Deprecated. Use PyUnicode_Translate instead. */
8781PyObject *
8782PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8783 Py_ssize_t size,
8784 PyObject *mapping,
8785 const char *errors)
8786{
Christian Heimes5f520f42012-09-11 14:03:25 +02008787 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8789 if (!unicode)
8790 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008791 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8792 Py_DECREF(unicode);
8793 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794}
8795
Alexander Belopolsky40018472011-02-26 01:02:56 +00008796PyObject *
8797PyUnicode_Translate(PyObject *str,
8798 PyObject *mapping,
8799 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800{
8801 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008802
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 str = PyUnicode_FromObject(str);
8804 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008805 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 Py_DECREF(str);
8808 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809}
Tim Petersced69f82003-09-16 20:30:58 +00008810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008812fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813{
8814 /* No need to call PyUnicode_READY(self) because this function is only
8815 called as a callback from fixup() which does it already. */
8816 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8817 const int kind = PyUnicode_KIND(self);
8818 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008819 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008820 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 Py_ssize_t i;
8822
8823 for (i = 0; i < len; ++i) {
8824 ch = PyUnicode_READ(kind, data, i);
8825 fixed = 0;
8826 if (ch > 127) {
8827 if (Py_UNICODE_ISSPACE(ch))
8828 fixed = ' ';
8829 else {
8830 const int decimal = Py_UNICODE_TODECIMAL(ch);
8831 if (decimal >= 0)
8832 fixed = '0' + decimal;
8833 }
8834 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008835 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008836 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 PyUnicode_WRITE(kind, data, i, fixed);
8838 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008839 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008840 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 }
8843
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008844 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845}
8846
8847PyObject *
8848_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8849{
8850 if (!PyUnicode_Check(unicode)) {
8851 PyErr_BadInternalCall();
8852 return NULL;
8853 }
8854 if (PyUnicode_READY(unicode) == -1)
8855 return NULL;
8856 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8857 /* If the string is already ASCII, just return the same string */
8858 Py_INCREF(unicode);
8859 return unicode;
8860 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008861 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862}
8863
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008864PyObject *
8865PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8866 Py_ssize_t length)
8867{
Victor Stinnerf0124502011-11-21 23:12:56 +01008868 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008869 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008870 Py_UCS4 maxchar;
8871 enum PyUnicode_Kind kind;
8872 void *data;
8873
Victor Stinner99d7ad02012-02-22 13:37:39 +01008874 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008875 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008876 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008877 if (ch > 127) {
8878 int decimal = Py_UNICODE_TODECIMAL(ch);
8879 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008880 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008881 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008882 }
8883 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008884
8885 /* Copy to a new string */
8886 decimal = PyUnicode_New(length, maxchar);
8887 if (decimal == NULL)
8888 return decimal;
8889 kind = PyUnicode_KIND(decimal);
8890 data = PyUnicode_DATA(decimal);
8891 /* Iterate over code points */
8892 for (i = 0; i < length; i++) {
8893 Py_UNICODE ch = s[i];
8894 if (ch > 127) {
8895 int decimal = Py_UNICODE_TODECIMAL(ch);
8896 if (decimal >= 0)
8897 ch = '0' + decimal;
8898 }
8899 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008901 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008902}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008903/* --- Decimal Encoder ---------------------------------------------------- */
8904
Alexander Belopolsky40018472011-02-26 01:02:56 +00008905int
8906PyUnicode_EncodeDecimal(Py_UNICODE *s,
8907 Py_ssize_t length,
8908 char *output,
8909 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008910{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008911 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008912 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008913 enum PyUnicode_Kind kind;
8914 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008915
8916 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 PyErr_BadArgument();
8918 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008919 }
8920
Victor Stinner42bf7752011-11-21 22:52:58 +01008921 unicode = PyUnicode_FromUnicode(s, length);
8922 if (unicode == NULL)
8923 return -1;
8924
Benjamin Petersonbac79492012-01-14 13:34:47 -05008925 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008926 Py_DECREF(unicode);
8927 return -1;
8928 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008929 kind = PyUnicode_KIND(unicode);
8930 data = PyUnicode_DATA(unicode);
8931
Victor Stinnerb84d7232011-11-22 01:50:07 +01008932 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008933 PyObject *exc;
8934 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008936 Py_ssize_t startpos;
8937
8938 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008939
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008941 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008942 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 decimal = Py_UNICODE_TODECIMAL(ch);
8946 if (decimal >= 0) {
8947 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008948 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 continue;
8950 }
8951 if (0 < ch && ch < 256) {
8952 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008953 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 continue;
8955 }
Victor Stinner6345be92011-11-25 20:09:01 +01008956
Victor Stinner42bf7752011-11-21 22:52:58 +01008957 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008958 exc = NULL;
8959 raise_encode_exception(&exc, "decimal", unicode,
8960 startpos, startpos+1,
8961 "invalid decimal Unicode string");
8962 Py_XDECREF(exc);
8963 Py_DECREF(unicode);
8964 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008965 }
8966 /* 0-terminate the output string */
8967 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008968 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008969 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008970}
8971
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972/* --- Helpers ------------------------------------------------------------ */
8973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008975any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 Py_ssize_t start,
8977 Py_ssize_t end)
8978{
8979 int kind1, kind2, kind;
8980 void *buf1, *buf2;
8981 Py_ssize_t len1, len2, result;
8982
8983 kind1 = PyUnicode_KIND(s1);
8984 kind2 = PyUnicode_KIND(s2);
8985 kind = kind1 > kind2 ? kind1 : kind2;
8986 buf1 = PyUnicode_DATA(s1);
8987 buf2 = PyUnicode_DATA(s2);
8988 if (kind1 != kind)
8989 buf1 = _PyUnicode_AsKind(s1, kind);
8990 if (!buf1)
8991 return -2;
8992 if (kind2 != kind)
8993 buf2 = _PyUnicode_AsKind(s2, kind);
8994 if (!buf2) {
8995 if (kind1 != kind) PyMem_Free(buf1);
8996 return -2;
8997 }
8998 len1 = PyUnicode_GET_LENGTH(s1);
8999 len2 = PyUnicode_GET_LENGTH(s2);
9000
Victor Stinner794d5672011-10-10 03:21:36 +02009001 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009002 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009003 case PyUnicode_1BYTE_KIND:
9004 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9005 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9006 else
9007 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9008 break;
9009 case PyUnicode_2BYTE_KIND:
9010 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9011 break;
9012 case PyUnicode_4BYTE_KIND:
9013 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9014 break;
9015 default:
9016 assert(0); result = -2;
9017 }
9018 }
9019 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009020 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009021 case PyUnicode_1BYTE_KIND:
9022 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9023 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9024 else
9025 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9026 break;
9027 case PyUnicode_2BYTE_KIND:
9028 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9029 break;
9030 case PyUnicode_4BYTE_KIND:
9031 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9032 break;
9033 default:
9034 assert(0); result = -2;
9035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 }
9037
9038 if (kind1 != kind)
9039 PyMem_Free(buf1);
9040 if (kind2 != kind)
9041 PyMem_Free(buf2);
9042
9043 return result;
9044}
9045
9046Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009047_PyUnicode_InsertThousandsGrouping(
9048 PyObject *unicode, Py_ssize_t index,
9049 Py_ssize_t n_buffer,
9050 void *digits, Py_ssize_t n_digits,
9051 Py_ssize_t min_width,
9052 const char *grouping, PyObject *thousands_sep,
9053 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054{
Victor Stinner41a863c2012-02-24 00:37:51 +01009055 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009056 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009057 Py_ssize_t thousands_sep_len;
9058 Py_ssize_t len;
9059
9060 if (unicode != NULL) {
9061 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009062 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009063 }
9064 else {
9065 kind = PyUnicode_1BYTE_KIND;
9066 data = NULL;
9067 }
9068 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9069 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9070 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9071 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009072 if (thousands_sep_kind < kind) {
9073 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9074 if (!thousands_sep_data)
9075 return -1;
9076 }
9077 else {
9078 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9079 if (!data)
9080 return -1;
9081 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009082 }
9083
Benjamin Petersonead6b532011-12-20 17:23:42 -06009084 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009086 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009087 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009088 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009089 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009090 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009091 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009092 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009093 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009094 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009095 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009096 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009098 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009099 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009100 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009101 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009102 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009104 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009105 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009106 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009107 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009108 break;
9109 default:
9110 assert(0);
9111 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009113 if (unicode != NULL && thousands_sep_kind != kind) {
9114 if (thousands_sep_kind < kind)
9115 PyMem_Free(thousands_sep_data);
9116 else
9117 PyMem_Free(data);
9118 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009119 if (unicode == NULL) {
9120 *maxchar = 127;
9121 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009122 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009123 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009124 }
9125 }
9126 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127}
9128
9129
/* Helper macro to fix up start/end slice values against a length `len`.

   `end` is clamped to `len` from above; a negative `end` or `start` is
   taken relative to `len` and then clamped to 0.  `start` is not clamped
   from above.  `start` and `end` must be modifiable lvalues; both are
   updated in place.

   Arguments are parenthesized so that expressions may be passed safely,
   and the body is wrapped in do { } while (0) so that the macro behaves
   as a single statement (e.g. as the body of an unbraced if/else).  It
   must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009144
Alexander Belopolsky40018472011-02-26 01:02:56 +00009145Py_ssize_t
9146PyUnicode_Count(PyObject *str,
9147 PyObject *substr,
9148 Py_ssize_t start,
9149 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009151 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009152 PyObject* str_obj;
9153 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 int kind1, kind2, kind;
9155 void *buf1 = NULL, *buf2 = NULL;
9156 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009157
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009158 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009159 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009161 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009162 if (!sub_obj) {
9163 Py_DECREF(str_obj);
9164 return -1;
9165 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009166 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009167 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 Py_DECREF(str_obj);
9169 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170 }
Tim Petersced69f82003-09-16 20:30:58 +00009171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009172 kind1 = PyUnicode_KIND(str_obj);
9173 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009174 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009177 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009178 if (kind2 > kind) {
9179 Py_DECREF(sub_obj);
9180 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009181 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009182 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009183 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 if (!buf2)
9186 goto onError;
9187 len1 = PyUnicode_GET_LENGTH(str_obj);
9188 len2 = PyUnicode_GET_LENGTH(sub_obj);
9189
9190 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009191 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009193 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9194 result = asciilib_count(
9195 ((Py_UCS1*)buf1) + start, end - start,
9196 buf2, len2, PY_SSIZE_T_MAX
9197 );
9198 else
9199 result = ucs1lib_count(
9200 ((Py_UCS1*)buf1) + start, end - start,
9201 buf2, len2, PY_SSIZE_T_MAX
9202 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 break;
9204 case PyUnicode_2BYTE_KIND:
9205 result = ucs2lib_count(
9206 ((Py_UCS2*)buf1) + start, end - start,
9207 buf2, len2, PY_SSIZE_T_MAX
9208 );
9209 break;
9210 case PyUnicode_4BYTE_KIND:
9211 result = ucs4lib_count(
9212 ((Py_UCS4*)buf1) + start, end - start,
9213 buf2, len2, PY_SSIZE_T_MAX
9214 );
9215 break;
9216 default:
9217 assert(0); result = 0;
9218 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009219
9220 Py_DECREF(sub_obj);
9221 Py_DECREF(str_obj);
9222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 if (kind2 != kind)
9224 PyMem_Free(buf2);
9225
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 onError:
9228 Py_DECREF(sub_obj);
9229 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 if (kind2 != kind && buf2)
9231 PyMem_Free(buf2);
9232 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233}
9234
Alexander Belopolsky40018472011-02-26 01:02:56 +00009235Py_ssize_t
9236PyUnicode_Find(PyObject *str,
9237 PyObject *sub,
9238 Py_ssize_t start,
9239 Py_ssize_t end,
9240 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009242 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009243
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009245 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009247 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009248 if (!sub) {
9249 Py_DECREF(str);
9250 return -2;
9251 }
9252 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9253 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 Py_DECREF(str);
9255 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256 }
Tim Petersced69f82003-09-16 20:30:58 +00009257
Victor Stinner794d5672011-10-10 03:21:36 +02009258 result = any_find_slice(direction,
9259 str, sub, start, end
9260 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009261
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009263 Py_DECREF(sub);
9264
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265 return result;
9266}
9267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268Py_ssize_t
9269PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9270 Py_ssize_t start, Py_ssize_t end,
9271 int direction)
9272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009274 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 if (PyUnicode_READY(str) == -1)
9276 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009277 if (start < 0 || end < 0) {
9278 PyErr_SetString(PyExc_IndexError, "string index out of range");
9279 return -2;
9280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 if (end > PyUnicode_GET_LENGTH(str))
9282 end = PyUnicode_GET_LENGTH(str);
9283 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009284 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9285 kind, end-start, ch, direction);
9286 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009288 else
9289 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290}
9291
Alexander Belopolsky40018472011-02-26 01:02:56 +00009292static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009293tailmatch(PyObject *self,
9294 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009295 Py_ssize_t start,
9296 Py_ssize_t end,
9297 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 int kind_self;
9300 int kind_sub;
9301 void *data_self;
9302 void *data_sub;
9303 Py_ssize_t offset;
9304 Py_ssize_t i;
9305 Py_ssize_t end_sub;
9306
9307 if (PyUnicode_READY(self) == -1 ||
9308 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009309 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310
9311 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312 return 1;
9313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9315 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009317 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 kind_self = PyUnicode_KIND(self);
9320 data_self = PyUnicode_DATA(self);
9321 kind_sub = PyUnicode_KIND(substring);
9322 data_sub = PyUnicode_DATA(substring);
9323 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9324
9325 if (direction > 0)
9326 offset = end;
9327 else
9328 offset = start;
9329
9330 if (PyUnicode_READ(kind_self, data_self, offset) ==
9331 PyUnicode_READ(kind_sub, data_sub, 0) &&
9332 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9333 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9334 /* If both are of the same kind, memcmp is sufficient */
9335 if (kind_self == kind_sub) {
9336 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009337 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 data_sub,
9339 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009340 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 }
9342 /* otherwise we have to compare each character by first accesing it */
9343 else {
9344 /* We do not need to compare 0 and len(substring)-1 because
9345 the if statement above ensured already that they are equal
9346 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 for (i = 1; i < end_sub; ++i) {
9348 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9349 PyUnicode_READ(kind_sub, data_sub, i))
9350 return 0;
9351 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 }
9355
9356 return 0;
9357}
9358
Alexander Belopolsky40018472011-02-26 01:02:56 +00009359Py_ssize_t
9360PyUnicode_Tailmatch(PyObject *str,
9361 PyObject *substr,
9362 Py_ssize_t start,
9363 Py_ssize_t end,
9364 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009366 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009367
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 str = PyUnicode_FromObject(str);
9369 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009370 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371 substr = PyUnicode_FromObject(substr);
9372 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 Py_DECREF(str);
9374 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 }
Tim Petersced69f82003-09-16 20:30:58 +00009376
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009377 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009378 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379 Py_DECREF(str);
9380 Py_DECREF(substr);
9381 return result;
9382}
9383
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384/* Apply fixfct filter to the Unicode object self and return a
9385 reference to the modified object */
9386
Alexander Belopolsky40018472011-02-26 01:02:56 +00009387static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009388fixup(PyObject *self,
9389 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 PyObject *u;
9392 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009393 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009395 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009398 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 /* fix functions return the new maximum character in a string,
9401 if the kind of the resulting unicode object does not change,
9402 everything is fine. Otherwise we need to change the string kind
9403 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009404 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009405
9406 if (maxchar_new == 0) {
9407 /* no changes */;
9408 if (PyUnicode_CheckExact(self)) {
9409 Py_DECREF(u);
9410 Py_INCREF(self);
9411 return self;
9412 }
9413 else
9414 return u;
9415 }
9416
Victor Stinnere6abb482012-05-02 01:15:40 +02009417 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418
Victor Stinnereaab6042011-12-11 22:22:39 +01009419 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009421
9422 /* In case the maximum character changed, we need to
9423 convert the string to the new category. */
9424 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9425 if (v == NULL) {
9426 Py_DECREF(u);
9427 return NULL;
9428 }
9429 if (maxchar_new > maxchar_old) {
9430 /* If the maxchar increased so that the kind changed, not all
9431 characters are representable anymore and we need to fix the
9432 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009433 _PyUnicode_FastCopyCharacters(v, 0,
9434 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009435 maxchar_old = fixfct(v);
9436 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 }
9438 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009439 _PyUnicode_FastCopyCharacters(v, 0,
9440 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009442 Py_DECREF(u);
9443 assert(_PyUnicode_CheckConsistency(v, 1));
9444 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445}
9446
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009447static PyObject *
9448ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009450 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9451 char *resdata, *data = PyUnicode_DATA(self);
9452 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009453
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009454 res = PyUnicode_New(len, 127);
9455 if (res == NULL)
9456 return NULL;
9457 resdata = PyUnicode_DATA(res);
9458 if (lower)
9459 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009461 _Py_bytes_upper(resdata, data, len);
9462 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463}
9464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009466handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009468 Py_ssize_t j;
9469 int final_sigma;
9470 Py_UCS4 c;
9471 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009472
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009473 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9474
9475 where ! is a negation and \p{xxx} is a character with property xxx.
9476 */
9477 for (j = i - 1; j >= 0; j--) {
9478 c = PyUnicode_READ(kind, data, j);
9479 if (!_PyUnicode_IsCaseIgnorable(c))
9480 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009482 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9483 if (final_sigma) {
9484 for (j = i + 1; j < length; j++) {
9485 c = PyUnicode_READ(kind, data, j);
9486 if (!_PyUnicode_IsCaseIgnorable(c))
9487 break;
9488 }
9489 final_sigma = j == length || !_PyUnicode_IsCased(c);
9490 }
9491 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492}
9493
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009494static int
9495lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9496 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009498 /* Obscure special case. */
9499 if (c == 0x3A3) {
9500 mapped[0] = handle_capital_sigma(kind, data, length, i);
9501 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009503 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504}
9505
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009506static Py_ssize_t
9507do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009509 Py_ssize_t i, k = 0;
9510 int n_res, j;
9511 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009512
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009513 c = PyUnicode_READ(kind, data, 0);
9514 n_res = _PyUnicode_ToUpperFull(c, mapped);
9515 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009516 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009517 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009519 for (i = 1; i < length; i++) {
9520 c = PyUnicode_READ(kind, data, i);
9521 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9522 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009523 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009524 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009525 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009526 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009527 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528}
9529
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009530static Py_ssize_t
9531do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9532 Py_ssize_t i, k = 0;
9533
9534 for (i = 0; i < length; i++) {
9535 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9536 int n_res, j;
9537 if (Py_UNICODE_ISUPPER(c)) {
9538 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9539 }
9540 else if (Py_UNICODE_ISLOWER(c)) {
9541 n_res = _PyUnicode_ToUpperFull(c, mapped);
9542 }
9543 else {
9544 n_res = 1;
9545 mapped[0] = c;
9546 }
9547 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009548 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009549 res[k++] = mapped[j];
9550 }
9551 }
9552 return k;
9553}
9554
9555static Py_ssize_t
9556do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9557 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009559 Py_ssize_t i, k = 0;
9560
9561 for (i = 0; i < length; i++) {
9562 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9563 int n_res, j;
9564 if (lower)
9565 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9566 else
9567 n_res = _PyUnicode_ToUpperFull(c, mapped);
9568 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009569 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009570 res[k++] = mapped[j];
9571 }
9572 }
9573 return k;
9574}
9575
9576static Py_ssize_t
9577do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9578{
9579 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9580}
9581
9582static Py_ssize_t
9583do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9584{
9585 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9586}
9587
Benjamin Petersone51757f2012-01-12 21:10:29 -05009588static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009589do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9590{
9591 Py_ssize_t i, k = 0;
9592
9593 for (i = 0; i < length; i++) {
9594 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9595 Py_UCS4 mapped[3];
9596 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9597 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009598 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009599 res[k++] = mapped[j];
9600 }
9601 }
9602 return k;
9603}
9604
9605static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009606do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9607{
9608 Py_ssize_t i, k = 0;
9609 int previous_is_cased;
9610
9611 previous_is_cased = 0;
9612 for (i = 0; i < length; i++) {
9613 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9614 Py_UCS4 mapped[3];
9615 int n_res, j;
9616
9617 if (previous_is_cased)
9618 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9619 else
9620 n_res = _PyUnicode_ToTitleFull(c, mapped);
9621
9622 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009623 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009624 res[k++] = mapped[j];
9625 }
9626
9627 previous_is_cased = _PyUnicode_IsCased(c);
9628 }
9629 return k;
9630}
9631
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009632static PyObject *
9633case_operation(PyObject *self,
9634 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9635{
9636 PyObject *res = NULL;
9637 Py_ssize_t length, newlength = 0;
9638 int kind, outkind;
9639 void *data, *outdata;
9640 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9641
Benjamin Petersoneea48462012-01-16 14:28:50 -05009642 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643
9644 kind = PyUnicode_KIND(self);
9645 data = PyUnicode_DATA(self);
9646 length = PyUnicode_GET_LENGTH(self);
9647 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9648 if (tmp == NULL)
9649 return PyErr_NoMemory();
9650 newlength = perform(kind, data, length, tmp, &maxchar);
9651 res = PyUnicode_New(newlength, maxchar);
9652 if (res == NULL)
9653 goto leave;
9654 tmpend = tmp + newlength;
9655 outdata = PyUnicode_DATA(res);
9656 outkind = PyUnicode_KIND(res);
9657 switch (outkind) {
9658 case PyUnicode_1BYTE_KIND:
9659 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9660 break;
9661 case PyUnicode_2BYTE_KIND:
9662 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9663 break;
9664 case PyUnicode_4BYTE_KIND:
9665 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9666 break;
9667 default:
9668 assert(0);
9669 break;
9670 }
9671 leave:
9672 PyMem_FREE(tmp);
9673 return res;
9674}
9675
Tim Peters8ce9f162004-08-27 01:49:32 +00009676PyObject *
9677PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009680 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009682 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009683 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9684 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009685 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009687 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009688 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009689 int use_memcpy;
9690 unsigned char *res_data = NULL, *sep_data = NULL;
9691 PyObject *last_obj;
9692 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009694 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009695 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009696 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009697 }
9698
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009699 /* NOTE: the following code can't call back into Python code,
9700 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009701 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009702
Tim Peters05eba1f2004-08-27 21:32:02 +00009703 seqlen = PySequence_Fast_GET_SIZE(fseq);
9704 /* If empty sequence, return u"". */
9705 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009706 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009707 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009708 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009709
Tim Peters05eba1f2004-08-27 21:32:02 +00009710 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009711 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009712 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009713 if (seqlen == 1) {
9714 if (PyUnicode_CheckExact(items[0])) {
9715 res = items[0];
9716 Py_INCREF(res);
9717 Py_DECREF(fseq);
9718 return res;
9719 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009720 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009721 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009722 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009723 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009724 /* Set up sep and seplen */
9725 if (separator == NULL) {
9726 /* fall back to a blank space separator */
9727 sep = PyUnicode_FromOrdinal(' ');
9728 if (!sep)
9729 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009730 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009731 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009732 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009733 else {
9734 if (!PyUnicode_Check(separator)) {
9735 PyErr_Format(PyExc_TypeError,
9736 "separator: expected str instance,"
9737 " %.80s found",
9738 Py_TYPE(separator)->tp_name);
9739 goto onError;
9740 }
9741 if (PyUnicode_READY(separator))
9742 goto onError;
9743 sep = separator;
9744 seplen = PyUnicode_GET_LENGTH(separator);
9745 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9746 /* inc refcount to keep this code path symmetric with the
9747 above case of a blank separator */
9748 Py_INCREF(sep);
9749 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009750 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009751 }
9752
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009753 /* There are at least two things to join, or else we have a subclass
9754 * of str in the sequence.
9755 * Do a pre-pass to figure out the total amount of space we'll
9756 * need (sz), and see whether all argument are strings.
9757 */
9758 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009759#ifdef Py_DEBUG
9760 use_memcpy = 0;
9761#else
9762 use_memcpy = 1;
9763#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009764 for (i = 0; i < seqlen; i++) {
9765 const Py_ssize_t old_sz = sz;
9766 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009767 if (!PyUnicode_Check(item)) {
9768 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009769 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009770 " %.80s found",
9771 i, Py_TYPE(item)->tp_name);
9772 goto onError;
9773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 if (PyUnicode_READY(item) == -1)
9775 goto onError;
9776 sz += PyUnicode_GET_LENGTH(item);
9777 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009778 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009779 if (i != 0)
9780 sz += seplen;
9781 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9782 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009783 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009784 goto onError;
9785 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009786 if (use_memcpy && last_obj != NULL) {
9787 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9788 use_memcpy = 0;
9789 }
9790 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009791 }
Tim Petersced69f82003-09-16 20:30:58 +00009792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009794 if (res == NULL)
9795 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009796
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009797 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009798#ifdef Py_DEBUG
9799 use_memcpy = 0;
9800#else
9801 if (use_memcpy) {
9802 res_data = PyUnicode_1BYTE_DATA(res);
9803 kind = PyUnicode_KIND(res);
9804 if (seplen != 0)
9805 sep_data = PyUnicode_1BYTE_DATA(sep);
9806 }
9807#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009808 if (use_memcpy) {
9809 for (i = 0; i < seqlen; ++i) {
9810 Py_ssize_t itemlen;
9811 item = items[i];
9812
9813 /* Copy item, and maybe the separator. */
9814 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009815 Py_MEMCPY(res_data,
9816 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009817 kind * seplen);
9818 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009819 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009820
9821 itemlen = PyUnicode_GET_LENGTH(item);
9822 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009823 Py_MEMCPY(res_data,
9824 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009825 kind * itemlen);
9826 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009827 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009828 }
9829 assert(res_data == PyUnicode_1BYTE_DATA(res)
9830 + kind * PyUnicode_GET_LENGTH(res));
9831 }
9832 else {
9833 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9834 Py_ssize_t itemlen;
9835 item = items[i];
9836
9837 /* Copy item, and maybe the separator. */
9838 if (i && seplen != 0) {
9839 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9840 res_offset += seplen;
9841 }
9842
9843 itemlen = PyUnicode_GET_LENGTH(item);
9844 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009845 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009846 res_offset += itemlen;
9847 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009848 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009849 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009850 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009851
Tim Peters05eba1f2004-08-27 21:32:02 +00009852 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856
Benjamin Peterson29060642009-01-31 22:14:21 +00009857 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009858 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009860 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861 return NULL;
9862}
9863
/* Store `value` into `length` consecutive characters of the character
   buffer `data`, beginning at character index `start`.

   `kind` selects the character width (1, 2 or 4 byte kind); the legacy
   wchar kind is rejected by an assertion.  For the 1 byte kind the value
   is truncated to unsigned char.  `value` and `length` are expanded more
   than once, so the arguments must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9887
Victor Stinnerd3f08822012-05-29 12:57:52 +02009888void
9889_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9890 Py_UCS4 fill_char)
9891{
9892 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9893 const void *data = PyUnicode_DATA(unicode);
9894 assert(PyUnicode_IS_READY(unicode));
9895 assert(unicode_modifiable(unicode));
9896 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9897 assert(start >= 0);
9898 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9899 FILL(kind, data, fill_char, start, length);
9900}
9901
Victor Stinner3fe55312012-01-04 00:33:50 +01009902Py_ssize_t
9903PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9904 Py_UCS4 fill_char)
9905{
9906 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009907
9908 if (!PyUnicode_Check(unicode)) {
9909 PyErr_BadInternalCall();
9910 return -1;
9911 }
9912 if (PyUnicode_READY(unicode) == -1)
9913 return -1;
9914 if (unicode_check_modifiable(unicode))
9915 return -1;
9916
Victor Stinnerd3f08822012-05-29 12:57:52 +02009917 if (start < 0) {
9918 PyErr_SetString(PyExc_IndexError, "string index out of range");
9919 return -1;
9920 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009921 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9922 PyErr_SetString(PyExc_ValueError,
9923 "fill character is bigger than "
9924 "the string maximum character");
9925 return -1;
9926 }
9927
9928 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9929 length = Py_MIN(maxlen, length);
9930 if (length <= 0)
9931 return 0;
9932
Victor Stinnerd3f08822012-05-29 12:57:52 +02009933 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009934 return length;
9935}
9936
Victor Stinner9310abb2011-10-05 00:59:23 +02009937static PyObject *
9938pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009939 Py_ssize_t left,
9940 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 PyObject *u;
9944 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009945 int kind;
9946 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947
9948 if (left < 0)
9949 left = 0;
9950 if (right < 0)
9951 right = 0;
9952
Victor Stinnerc4b49542011-12-11 22:44:26 +01009953 if (left == 0 && right == 0)
9954 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9957 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009958 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9959 return NULL;
9960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009962 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009964 if (!u)
9965 return NULL;
9966
9967 kind = PyUnicode_KIND(u);
9968 data = PyUnicode_DATA(u);
9969 if (left)
9970 FILL(kind, data, fill, 0, left);
9971 if (right)
9972 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009973 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009974 assert(_PyUnicode_CheckConsistency(u, 1));
9975 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976}
9977
Alexander Belopolsky40018472011-02-26 01:02:56 +00009978PyObject *
9979PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982
9983 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009984 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009986 if (PyUnicode_READY(string) == -1) {
9987 Py_DECREF(string);
9988 return NULL;
9989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990
Benjamin Petersonead6b532011-12-20 17:23:42 -06009991 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009993 if (PyUnicode_IS_ASCII(string))
9994 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009995 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009996 PyUnicode_GET_LENGTH(string), keepends);
9997 else
9998 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009999 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010000 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 break;
10002 case PyUnicode_2BYTE_KIND:
10003 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010004 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 PyUnicode_GET_LENGTH(string), keepends);
10006 break;
10007 case PyUnicode_4BYTE_KIND:
10008 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010009 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 PyUnicode_GET_LENGTH(string), keepends);
10011 break;
10012 default:
10013 assert(0);
10014 list = 0;
10015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 Py_DECREF(string);
10017 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018}
10019
Alexander Belopolsky40018472011-02-26 01:02:56 +000010020static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010021split(PyObject *self,
10022 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010023 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 int kind1, kind2, kind;
10026 void *buf1, *buf2;
10027 Py_ssize_t len1, len2;
10028 PyObject* out;
10029
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010031 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 if (PyUnicode_READY(self) == -1)
10034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010037 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010039 if (PyUnicode_IS_ASCII(self))
10040 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010041 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010042 PyUnicode_GET_LENGTH(self), maxcount
10043 );
10044 else
10045 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010046 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010047 PyUnicode_GET_LENGTH(self), maxcount
10048 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 case PyUnicode_2BYTE_KIND:
10050 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010051 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 PyUnicode_GET_LENGTH(self), maxcount
10053 );
10054 case PyUnicode_4BYTE_KIND:
10055 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010056 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 PyUnicode_GET_LENGTH(self), maxcount
10058 );
10059 default:
10060 assert(0);
10061 return NULL;
10062 }
10063
10064 if (PyUnicode_READY(substring) == -1)
10065 return NULL;
10066
10067 kind1 = PyUnicode_KIND(self);
10068 kind2 = PyUnicode_KIND(substring);
10069 kind = kind1 > kind2 ? kind1 : kind2;
10070 buf1 = PyUnicode_DATA(self);
10071 buf2 = PyUnicode_DATA(substring);
10072 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010073 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 if (!buf1)
10075 return NULL;
10076 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010077 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 if (!buf2) {
10079 if (kind1 != kind) PyMem_Free(buf1);
10080 return NULL;
10081 }
10082 len1 = PyUnicode_GET_LENGTH(self);
10083 len2 = PyUnicode_GET_LENGTH(substring);
10084
Benjamin Petersonead6b532011-12-20 17:23:42 -060010085 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010087 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10088 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010089 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010090 else
10091 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010092 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 break;
10094 case PyUnicode_2BYTE_KIND:
10095 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010096 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 break;
10098 case PyUnicode_4BYTE_KIND:
10099 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010100 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 break;
10102 default:
10103 out = NULL;
10104 }
10105 if (kind1 != kind)
10106 PyMem_Free(buf1);
10107 if (kind2 != kind)
10108 PyMem_Free(buf2);
10109 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110}
10111
Alexander Belopolsky40018472011-02-26 01:02:56 +000010112static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010113rsplit(PyObject *self,
10114 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010115 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 int kind1, kind2, kind;
10118 void *buf1, *buf2;
10119 Py_ssize_t len1, len2;
10120 PyObject* out;
10121
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010122 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010123 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 if (PyUnicode_READY(self) == -1)
10126 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010129 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010131 if (PyUnicode_IS_ASCII(self))
10132 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010133 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010134 PyUnicode_GET_LENGTH(self), maxcount
10135 );
10136 else
10137 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010138 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010139 PyUnicode_GET_LENGTH(self), maxcount
10140 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 case PyUnicode_2BYTE_KIND:
10142 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010143 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 PyUnicode_GET_LENGTH(self), maxcount
10145 );
10146 case PyUnicode_4BYTE_KIND:
10147 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010148 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 PyUnicode_GET_LENGTH(self), maxcount
10150 );
10151 default:
10152 assert(0);
10153 return NULL;
10154 }
10155
10156 if (PyUnicode_READY(substring) == -1)
10157 return NULL;
10158
10159 kind1 = PyUnicode_KIND(self);
10160 kind2 = PyUnicode_KIND(substring);
10161 kind = kind1 > kind2 ? kind1 : kind2;
10162 buf1 = PyUnicode_DATA(self);
10163 buf2 = PyUnicode_DATA(substring);
10164 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010165 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 if (!buf1)
10167 return NULL;
10168 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010169 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 if (!buf2) {
10171 if (kind1 != kind) PyMem_Free(buf1);
10172 return NULL;
10173 }
10174 len1 = PyUnicode_GET_LENGTH(self);
10175 len2 = PyUnicode_GET_LENGTH(substring);
10176
Benjamin Petersonead6b532011-12-20 17:23:42 -060010177 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010179 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10180 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010181 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 else
10183 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010184 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 break;
10186 case PyUnicode_2BYTE_KIND:
10187 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 break;
10190 case PyUnicode_4BYTE_KIND:
10191 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 break;
10194 default:
10195 out = NULL;
10196 }
10197 if (kind1 != kind)
10198 PyMem_Free(buf1);
10199 if (kind2 != kind)
10200 PyMem_Free(buf2);
10201 return out;
10202}
10203
10204static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010205anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10206 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010208 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10211 return asciilib_find(buf1, len1, buf2, len2, offset);
10212 else
10213 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 case PyUnicode_2BYTE_KIND:
10215 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10216 case PyUnicode_4BYTE_KIND:
10217 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10218 }
10219 assert(0);
10220 return -1;
10221}
10222
10223static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10225 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010227 switch (kind) {
10228 case PyUnicode_1BYTE_KIND:
10229 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10230 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10231 else
10232 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10233 case PyUnicode_2BYTE_KIND:
10234 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10235 case PyUnicode_4BYTE_KIND:
10236 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10237 }
10238 assert(0);
10239 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010240}
10241
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010242static void
10243replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10244 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10245{
10246 int kind = PyUnicode_KIND(u);
10247 void *data = PyUnicode_DATA(u);
10248 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10249 if (kind == PyUnicode_1BYTE_KIND) {
10250 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10251 (Py_UCS1 *)data + len,
10252 u1, u2, maxcount);
10253 }
10254 else if (kind == PyUnicode_2BYTE_KIND) {
10255 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10256 (Py_UCS2 *)data + len,
10257 u1, u2, maxcount);
10258 }
10259 else {
10260 assert(kind == PyUnicode_4BYTE_KIND);
10261 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10262 (Py_UCS4 *)data + len,
10263 u1, u2, maxcount);
10264 }
10265}
10266
Alexander Belopolsky40018472011-02-26 01:02:56 +000010267static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268replace(PyObject *self, PyObject *str1,
10269 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 PyObject *u;
10272 char *sbuf = PyUnicode_DATA(self);
10273 char *buf1 = PyUnicode_DATA(str1);
10274 char *buf2 = PyUnicode_DATA(str2);
10275 int srelease = 0, release1 = 0, release2 = 0;
10276 int skind = PyUnicode_KIND(self);
10277 int kind1 = PyUnicode_KIND(str1);
10278 int kind2 = PyUnicode_KIND(str2);
10279 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10280 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10281 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010282 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010283 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
10285 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010286 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010288 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289
Victor Stinner59de0ee2011-10-07 10:01:28 +020010290 if (str1 == str2)
10291 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292
Victor Stinner49a0a212011-10-12 23:46:10 +020010293 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010294 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10295 if (maxchar < maxchar_str1)
10296 /* substring too wide to be present */
10297 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010298 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10299 /* Replacing str1 with str2 may cause a maxchar reduction in the
10300 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010301 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010302 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010305 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010307 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010309 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010310 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010311 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010312
Victor Stinner69ed0f42013-04-09 21:48:24 +020010313 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010314 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010315 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010316 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010317 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010321
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010322 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10323 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010324 }
10325 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 int rkind = skind;
10327 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010328 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 if (kind1 < rkind) {
10331 /* widen substring */
10332 buf1 = _PyUnicode_AsKind(str1, rkind);
10333 if (!buf1) goto error;
10334 release1 = 1;
10335 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010336 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010337 if (i < 0)
10338 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (rkind > kind2) {
10340 /* widen replacement */
10341 buf2 = _PyUnicode_AsKind(str2, rkind);
10342 if (!buf2) goto error;
10343 release2 = 1;
10344 }
10345 else if (rkind < kind2) {
10346 /* widen self and buf1 */
10347 rkind = kind2;
10348 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010349 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 sbuf = _PyUnicode_AsKind(self, rkind);
10351 if (!sbuf) goto error;
10352 srelease = 1;
10353 buf1 = _PyUnicode_AsKind(str1, rkind);
10354 if (!buf1) goto error;
10355 release1 = 1;
10356 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010357 u = PyUnicode_New(slen, maxchar);
10358 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010360 assert(PyUnicode_KIND(u) == rkind);
10361 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010362
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010363 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010364 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010365 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010367 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010369
10370 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010371 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010372 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010373 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010374 if (i == -1)
10375 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010376 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010378 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010382 }
10383 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010385 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 int rkind = skind;
10387 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010390 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 buf1 = _PyUnicode_AsKind(str1, rkind);
10392 if (!buf1) goto error;
10393 release1 = 1;
10394 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010395 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010396 if (n == 0)
10397 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010399 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 buf2 = _PyUnicode_AsKind(str2, rkind);
10401 if (!buf2) goto error;
10402 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010405 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 rkind = kind2;
10407 sbuf = _PyUnicode_AsKind(self, rkind);
10408 if (!sbuf) goto error;
10409 srelease = 1;
10410 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010411 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 buf1 = _PyUnicode_AsKind(str1, rkind);
10413 if (!buf1) goto error;
10414 release1 = 1;
10415 }
10416 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10417 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010418 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 PyErr_SetString(PyExc_OverflowError,
10420 "replace string is too long");
10421 goto error;
10422 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010423 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010424 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010425 _Py_INCREF_UNICODE_EMPTY();
10426 if (!unicode_empty)
10427 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010428 u = unicode_empty;
10429 goto done;
10430 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010431 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 PyErr_SetString(PyExc_OverflowError,
10433 "replace string is too long");
10434 goto error;
10435 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010436 u = PyUnicode_New(new_size, maxchar);
10437 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010439 assert(PyUnicode_KIND(u) == rkind);
10440 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 ires = i = 0;
10442 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010443 while (n-- > 0) {
10444 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010445 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010446 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010447 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010448 if (j == -1)
10449 break;
10450 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010452 memcpy(res + rkind * ires,
10453 sbuf + rkind * i,
10454 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010456 }
10457 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010459 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010461 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010467 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010468 memcpy(res + rkind * ires,
10469 sbuf + rkind * i,
10470 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010471 }
10472 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473 /* interleave */
10474 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010477 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010479 if (--n <= 0)
10480 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010481 memcpy(res + rkind * ires,
10482 sbuf + rkind * i,
10483 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 ires++;
10485 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010486 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010487 memcpy(res + rkind * ires,
10488 sbuf + rkind * i,
10489 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010490 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010491 }
10492
10493 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010494 unicode_adjust_maxchar(&u);
10495 if (u == NULL)
10496 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010498
10499 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 if (srelease)
10501 PyMem_FREE(sbuf);
10502 if (release1)
10503 PyMem_FREE(buf1);
10504 if (release2)
10505 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010506 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508
Benjamin Peterson29060642009-01-31 22:14:21 +000010509 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010510 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (srelease)
10512 PyMem_FREE(sbuf);
10513 if (release1)
10514 PyMem_FREE(buf1);
10515 if (release2)
10516 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010517 return unicode_result_unchanged(self);
10518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 error:
10520 if (srelease && sbuf)
10521 PyMem_FREE(sbuf);
10522 if (release1 && buf1)
10523 PyMem_FREE(buf1);
10524 if (release2 && buf2)
10525 PyMem_FREE(buf2);
10526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527}
10528
10529/* --- Unicode Object Methods --------------------------------------------- */
10530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010531PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533\n\
10534Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010535characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536
10537static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010538unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010540 if (PyUnicode_READY(self) == -1)
10541 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010542 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543}
10544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010545PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547\n\
10548Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010549have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550
10551static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010552unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010554 if (PyUnicode_READY(self) == -1)
10555 return NULL;
10556 if (PyUnicode_GET_LENGTH(self) == 0)
10557 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010558 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559}
10560
Benjamin Petersond5890c82012-01-14 13:23:30 -050010561PyDoc_STRVAR(casefold__doc__,
10562 "S.casefold() -> str\n\
10563\n\
10564Return a version of S suitable for caseless comparisons.");
10565
10566static PyObject *
10567unicode_casefold(PyObject *self)
10568{
10569 if (PyUnicode_READY(self) == -1)
10570 return NULL;
10571 if (PyUnicode_IS_ASCII(self))
10572 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010573 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010574}
10575
10576
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010577/* Argument converter. Coerces to a single unicode character */
10578
10579static int
10580convert_uc(PyObject *obj, void *addr)
10581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010583 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010584
Benjamin Peterson14339b62009-01-31 16:36:08 +000010585 uniobj = PyUnicode_FromObject(obj);
10586 if (uniobj == NULL) {
10587 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010588 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010589 return 0;
10590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010592 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010593 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010594 Py_DECREF(uniobj);
10595 return 0;
10596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010598 Py_DECREF(uniobj);
10599 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010600}
10601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010602PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010603 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010605Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010606done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607
10608static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010609unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010611 Py_ssize_t marg, left;
10612 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 Py_UCS4 fillchar = ' ';
10614
Victor Stinnere9a29352011-10-01 02:14:59 +020010615 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Benjamin Petersonbac79492012-01-14 13:34:47 -050010618 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 return NULL;
10620
Victor Stinnerc4b49542011-12-11 22:44:26 +010010621 if (PyUnicode_GET_LENGTH(self) >= width)
10622 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623
Victor Stinnerc4b49542011-12-11 22:44:26 +010010624 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625 left = marg / 2 + (marg & width & 1);
10626
Victor Stinner9310abb2011-10-05 00:59:23 +020010627 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628}
10629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630/* This function assumes that str1 and str2 are readied by the caller. */
10631
Marc-André Lemburge5034372000-08-08 08:04:29 +000010632static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010633unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010634{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010635#define COMPARE(TYPE1, TYPE2) \
10636 do { \
10637 TYPE1* p1 = (TYPE1 *)data1; \
10638 TYPE2* p2 = (TYPE2 *)data2; \
10639 TYPE1* end = p1 + len; \
10640 Py_UCS4 c1, c2; \
10641 for (; p1 != end; p1++, p2++) { \
10642 c1 = *p1; \
10643 c2 = *p2; \
10644 if (c1 != c2) \
10645 return (c1 < c2) ? -1 : 1; \
10646 } \
10647 } \
10648 while (0)
10649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 int kind1, kind2;
10651 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010652 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 kind1 = PyUnicode_KIND(str1);
10655 kind2 = PyUnicode_KIND(str2);
10656 data1 = PyUnicode_DATA(str1);
10657 data2 = PyUnicode_DATA(str2);
10658 len1 = PyUnicode_GET_LENGTH(str1);
10659 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010660 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010661
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010662 switch(kind1) {
10663 case PyUnicode_1BYTE_KIND:
10664 {
10665 switch(kind2) {
10666 case PyUnicode_1BYTE_KIND:
10667 {
10668 int cmp = memcmp(data1, data2, len);
10669 /* normalize result of memcmp() into the range [-1; 1] */
10670 if (cmp < 0)
10671 return -1;
10672 if (cmp > 0)
10673 return 1;
10674 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010675 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010676 case PyUnicode_2BYTE_KIND:
10677 COMPARE(Py_UCS1, Py_UCS2);
10678 break;
10679 case PyUnicode_4BYTE_KIND:
10680 COMPARE(Py_UCS1, Py_UCS4);
10681 break;
10682 default:
10683 assert(0);
10684 }
10685 break;
10686 }
10687 case PyUnicode_2BYTE_KIND:
10688 {
10689 switch(kind2) {
10690 case PyUnicode_1BYTE_KIND:
10691 COMPARE(Py_UCS2, Py_UCS1);
10692 break;
10693 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010694 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010695 COMPARE(Py_UCS2, Py_UCS2);
10696 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010697 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010698 case PyUnicode_4BYTE_KIND:
10699 COMPARE(Py_UCS2, Py_UCS4);
10700 break;
10701 default:
10702 assert(0);
10703 }
10704 break;
10705 }
10706 case PyUnicode_4BYTE_KIND:
10707 {
10708 switch(kind2) {
10709 case PyUnicode_1BYTE_KIND:
10710 COMPARE(Py_UCS4, Py_UCS1);
10711 break;
10712 case PyUnicode_2BYTE_KIND:
10713 COMPARE(Py_UCS4, Py_UCS2);
10714 break;
10715 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010716 {
10717#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10718 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10719 /* normalize result of wmemcmp() into the range [-1; 1] */
10720 if (cmp < 0)
10721 return -1;
10722 if (cmp > 0)
10723 return 1;
10724#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010725 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010726#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010727 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010728 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010729 default:
10730 assert(0);
10731 }
10732 break;
10733 }
10734 default:
10735 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010736 }
10737
Victor Stinner770e19e2012-10-04 22:59:45 +020010738 if (len1 == len2)
10739 return 0;
10740 if (len1 < len2)
10741 return -1;
10742 else
10743 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010744
10745#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010746}
10747
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010748Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010749unicode_compare_eq(PyObject *str1, PyObject *str2)
10750{
10751 int kind;
10752 void *data1, *data2;
10753 Py_ssize_t len;
10754 int cmp;
10755
Victor Stinnere5567ad2012-10-23 02:48:49 +020010756 len = PyUnicode_GET_LENGTH(str1);
10757 if (PyUnicode_GET_LENGTH(str2) != len)
10758 return 0;
10759 kind = PyUnicode_KIND(str1);
10760 if (PyUnicode_KIND(str2) != kind)
10761 return 0;
10762 data1 = PyUnicode_DATA(str1);
10763 data2 = PyUnicode_DATA(str2);
10764
10765 cmp = memcmp(data1, data2, len * kind);
10766 return (cmp == 0);
10767}
10768
10769
Alexander Belopolsky40018472011-02-26 01:02:56 +000010770int
10771PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10774 if (PyUnicode_READY(left) == -1 ||
10775 PyUnicode_READY(right) == -1)
10776 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010777
10778 /* a string is equal to itself */
10779 if (left == right)
10780 return 0;
10781
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010782 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010784 PyErr_Format(PyExc_TypeError,
10785 "Can't compare %.100s and %.100s",
10786 left->ob_type->tp_name,
10787 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788 return -1;
10789}
10790
Martin v. Löwis5b222132007-06-10 09:51:05 +000010791int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010792_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10793{
10794 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10795 if (right_str == NULL)
10796 return -1;
10797 return PyUnicode_Compare(left, right_str);
10798}
10799
10800int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010801PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 Py_ssize_t i;
10804 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 Py_UCS4 chr;
10806
Victor Stinner910337b2011-10-03 03:20:16 +020010807 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 if (PyUnicode_READY(uni) == -1)
10809 return -1;
10810 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010811 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010812 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010813 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010814 size_t len, len2 = strlen(str);
10815 int cmp;
10816
10817 len = Py_MIN(len1, len2);
10818 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010819 if (cmp != 0) {
10820 if (cmp < 0)
10821 return -1;
10822 else
10823 return 1;
10824 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010825 if (len1 > len2)
10826 return 1; /* uni is longer */
10827 if (len2 > len1)
10828 return -1; /* str is longer */
10829 return 0;
10830 }
10831 else {
10832 void *data = PyUnicode_DATA(uni);
10833 /* Compare Unicode string and source character set string */
10834 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10835 if (chr != str[i])
10836 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10837 /* This check keeps Python strings that end in '\0' from comparing equal
10838 to C strings identical up to that point. */
10839 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10840 return 1; /* uni is longer */
10841 if (str[i])
10842 return -1; /* str is longer */
10843 return 0;
10844 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010845}
10846
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010847
Benjamin Peterson29060642009-01-31 22:14:21 +000010848#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010849 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010850
Alexander Belopolsky40018472011-02-26 01:02:56 +000010851PyObject *
10852PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010853{
10854 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010855 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010856
Victor Stinnere5567ad2012-10-23 02:48:49 +020010857 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10858 Py_RETURN_NOTIMPLEMENTED;
10859
10860 if (PyUnicode_READY(left) == -1 ||
10861 PyUnicode_READY(right) == -1)
10862 return NULL;
10863
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010864 if (left == right) {
10865 switch (op) {
10866 case Py_EQ:
10867 case Py_LE:
10868 case Py_GE:
10869 /* a string is equal to itself */
10870 v = Py_True;
10871 break;
10872 case Py_NE:
10873 case Py_LT:
10874 case Py_GT:
10875 v = Py_False;
10876 break;
10877 default:
10878 PyErr_BadArgument();
10879 return NULL;
10880 }
10881 }
10882 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010883 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010884 result ^= (op == Py_NE);
10885 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010886 }
10887 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010888 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010889
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010890 /* Convert the return value to a Boolean */
10891 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010892 case Py_LE:
10893 v = TEST_COND(result <= 0);
10894 break;
10895 case Py_GE:
10896 v = TEST_COND(result >= 0);
10897 break;
10898 case Py_LT:
10899 v = TEST_COND(result == -1);
10900 break;
10901 case Py_GT:
10902 v = TEST_COND(result == 1);
10903 break;
10904 default:
10905 PyErr_BadArgument();
10906 return NULL;
10907 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010908 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010909 Py_INCREF(v);
10910 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010911}
10912
Alexander Belopolsky40018472011-02-26 01:02:56 +000010913int
10914PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010915{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010916 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010917 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 void *buf1, *buf2;
10919 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010920 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010921
10922 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010923 sub = PyUnicode_FromObject(element);
10924 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010925 PyErr_Format(PyExc_TypeError,
10926 "'in <string>' requires string as left operand, not %s",
10927 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010928 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010929 }
10930
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010932 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010933 Py_DECREF(sub);
10934 return -1;
10935 }
10936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 kind1 = PyUnicode_KIND(str);
10938 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 buf1 = PyUnicode_DATA(str);
10940 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010941 if (kind2 != kind1) {
10942 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010943 Py_DECREF(sub);
10944 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010945 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010946 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010947 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 if (!buf2) {
10950 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010951 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 return -1;
10953 }
10954 len1 = PyUnicode_GET_LENGTH(str);
10955 len2 = PyUnicode_GET_LENGTH(sub);
10956
Victor Stinner77282cb2013-04-14 19:22:47 +020010957 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 case PyUnicode_1BYTE_KIND:
10959 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10960 break;
10961 case PyUnicode_2BYTE_KIND:
10962 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10963 break;
10964 case PyUnicode_4BYTE_KIND:
10965 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10966 break;
10967 default:
10968 result = -1;
10969 assert(0);
10970 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010971
10972 Py_DECREF(str);
10973 Py_DECREF(sub);
10974
Victor Stinner77282cb2013-04-14 19:22:47 +020010975 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 PyMem_Free(buf2);
10977
Guido van Rossum403d68b2000-03-13 15:55:09 +000010978 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010979}
10980
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981/* Concat to string or Unicode object giving a new Unicode object. */
10982
Alexander Belopolsky40018472011-02-26 01:02:56 +000010983PyObject *
10984PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010987 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010988 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989
10990 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997
10998 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010999 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011003 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 }
11007
Victor Stinner488fa492011-12-12 00:01:39 +010011008 u_len = PyUnicode_GET_LENGTH(u);
11009 v_len = PyUnicode_GET_LENGTH(v);
11010 if (u_len > PY_SSIZE_T_MAX - v_len) {
11011 PyErr_SetString(PyExc_OverflowError,
11012 "strings are too large to concat");
11013 goto onError;
11014 }
11015 new_len = u_len + v_len;
11016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011018 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011019 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011022 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011025 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11026 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 Py_DECREF(u);
11028 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011029 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 Py_XDECREF(u);
11034 Py_XDECREF(v);
11035 return NULL;
11036}
11037
Walter Dörwald1ab83302007-05-18 17:15:44 +000011038void
Victor Stinner23e56682011-10-03 03:54:37 +020011039PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011040{
Victor Stinner23e56682011-10-03 03:54:37 +020011041 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011042 Py_UCS4 maxchar, maxchar2;
11043 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011044
11045 if (p_left == NULL) {
11046 if (!PyErr_Occurred())
11047 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011048 return;
11049 }
Victor Stinner23e56682011-10-03 03:54:37 +020011050 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011051 if (right == NULL || left == NULL
11052 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011053 if (!PyErr_Occurred())
11054 PyErr_BadInternalCall();
11055 goto error;
11056 }
11057
Benjamin Petersonbac79492012-01-14 13:34:47 -050011058 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011059 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011060 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011061 goto error;
11062
Victor Stinner488fa492011-12-12 00:01:39 +010011063 /* Shortcuts */
11064 if (left == unicode_empty) {
11065 Py_DECREF(left);
11066 Py_INCREF(right);
11067 *p_left = right;
11068 return;
11069 }
11070 if (right == unicode_empty)
11071 return;
11072
11073 left_len = PyUnicode_GET_LENGTH(left);
11074 right_len = PyUnicode_GET_LENGTH(right);
11075 if (left_len > PY_SSIZE_T_MAX - right_len) {
11076 PyErr_SetString(PyExc_OverflowError,
11077 "strings are too large to concat");
11078 goto error;
11079 }
11080 new_len = left_len + right_len;
11081
11082 if (unicode_modifiable(left)
11083 && PyUnicode_CheckExact(right)
11084 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011085 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11086 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011087 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011088 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011089 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11090 {
11091 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011092 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011093 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011094
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011095 /* copy 'right' into the newly allocated area of 'left' */
11096 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011097 }
Victor Stinner488fa492011-12-12 00:01:39 +010011098 else {
11099 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11100 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011101 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011102
Victor Stinner488fa492011-12-12 00:01:39 +010011103 /* Concat the two Unicode strings */
11104 res = PyUnicode_New(new_len, maxchar);
11105 if (res == NULL)
11106 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011107 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11108 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011109 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011110 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011111 }
11112 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011113 return;
11114
11115error:
Victor Stinner488fa492011-12-12 00:01:39 +010011116 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011117}
11118
11119void
11120PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11121{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011122 PyUnicode_Append(pleft, right);
11123 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011124}
11125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011126PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011127 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011129Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011130string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011131interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132
11133static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011134unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011136 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011137 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011138 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 int kind1, kind2, kind;
11141 void *buf1, *buf2;
11142 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143
Jesus Ceaac451502011-04-20 17:09:23 +020011144 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11145 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 kind1 = PyUnicode_KIND(self);
11149 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011150 if (kind2 > kind1) {
11151 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011152 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011153 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011154 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 buf1 = PyUnicode_DATA(self);
11156 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011158 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if (!buf2) {
11160 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 return NULL;
11162 }
11163 len1 = PyUnicode_GET_LENGTH(self);
11164 len2 = PyUnicode_GET_LENGTH(substring);
11165
11166 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011167 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 case PyUnicode_1BYTE_KIND:
11169 iresult = ucs1lib_count(
11170 ((Py_UCS1*)buf1) + start, end - start,
11171 buf2, len2, PY_SSIZE_T_MAX
11172 );
11173 break;
11174 case PyUnicode_2BYTE_KIND:
11175 iresult = ucs2lib_count(
11176 ((Py_UCS2*)buf1) + start, end - start,
11177 buf2, len2, PY_SSIZE_T_MAX
11178 );
11179 break;
11180 case PyUnicode_4BYTE_KIND:
11181 iresult = ucs4lib_count(
11182 ((Py_UCS4*)buf1) + start, end - start,
11183 buf2, len2, PY_SSIZE_T_MAX
11184 );
11185 break;
11186 default:
11187 assert(0); iresult = 0;
11188 }
11189
11190 result = PyLong_FromSsize_t(iresult);
11191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 if (kind2 != kind)
11193 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194
11195 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011196
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197 return result;
11198}
11199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011200PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011201 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011203Encode S using the codec registered for encoding. Default encoding\n\
11204is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011205handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011206a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11207'xmlcharrefreplace' as well as any other name registered with\n\
11208codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209
11210static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011211unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011213 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214 char *encoding = NULL;
11215 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011216
Benjamin Peterson308d6372009-09-18 21:42:35 +000011217 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11218 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011220 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011221}
11222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011223PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011224 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225\n\
11226Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228
11229static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011230unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011232 Py_ssize_t i, j, line_pos, src_len, incr;
11233 Py_UCS4 ch;
11234 PyObject *u;
11235 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011236 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011238 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011239 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
Ezio Melotti745d54d2013-11-16 19:10:57 +020011241 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11242 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244
Antoine Pitrou22425222011-10-04 19:10:51 +020011245 if (PyUnicode_READY(self) == -1)
11246 return NULL;
11247
Thomas Wouters7e474022000-07-16 12:04:32 +000011248 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011249 src_len = PyUnicode_GET_LENGTH(self);
11250 i = j = line_pos = 0;
11251 kind = PyUnicode_KIND(self);
11252 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011253 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011254 for (; i < src_len; i++) {
11255 ch = PyUnicode_READ(kind, src_data, i);
11256 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011257 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011259 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011261 goto overflow;
11262 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011264 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011268 goto overflow;
11269 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011271 if (ch == '\n' || ch == '\r')
11272 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011274 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011275 if (!found)
11276 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011277
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011279 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 if (!u)
11281 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
Antoine Pitroue71d5742011-10-04 15:55:09 +020011284 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
Antoine Pitroue71d5742011-10-04 15:55:09 +020011286 for (; i < src_len; i++) {
11287 ch = PyUnicode_READ(kind, src_data, i);
11288 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011289 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011290 incr = tabsize - (line_pos % tabsize);
11291 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011292 FILL(kind, dest_data, ' ', j, incr);
11293 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011295 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 line_pos++;
11298 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011299 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 if (ch == '\n' || ch == '\r')
11301 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011303 }
11304 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011305 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011306
Antoine Pitroue71d5742011-10-04 15:55:09 +020011307 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011308 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310}
11311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011312PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011313 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314\n\
11315Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011316such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317arguments start and end are interpreted as in slice notation.\n\
11318\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011319Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
11321static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011324 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011325 Py_ssize_t start;
11326 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011327 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328
Jesus Ceaac451502011-04-20 17:09:23 +020011329 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11330 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332
Christian Heimesd47802e2013-06-29 21:33:36 +020011333 if (PyUnicode_READY(self) == -1) {
11334 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011336 }
11337 if (PyUnicode_READY(substring) == -1) {
11338 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341
Victor Stinner7931d9a2011-11-04 00:22:48 +010011342 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343
11344 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 if (result == -2)
11347 return NULL;
11348
Christian Heimes217cfd12007-12-02 14:31:20 +000011349 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350}
11351
11352static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011353unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011355 void *data;
11356 enum PyUnicode_Kind kind;
11357 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011358
11359 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11360 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011362 }
11363 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11364 PyErr_SetString(PyExc_IndexError, "string index out of range");
11365 return NULL;
11366 }
11367 kind = PyUnicode_KIND(self);
11368 data = PyUnicode_DATA(self);
11369 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011370 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371}
11372
Guido van Rossumc2504932007-09-18 19:42:40 +000011373/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011374 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011375static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011376unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377{
Guido van Rossumc2504932007-09-18 19:42:40 +000011378 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011379 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011380
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011381#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011382 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011383#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 if (_PyUnicode_HASH(self) != -1)
11385 return _PyUnicode_HASH(self);
11386 if (PyUnicode_READY(self) == -1)
11387 return -1;
11388 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011389 /*
11390 We make the hash of the empty string be 0, rather than using
11391 (prefix ^ suffix), since this slightly obfuscates the hash secret
11392 */
11393 if (len == 0) {
11394 _PyUnicode_HASH(self) = 0;
11395 return 0;
11396 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011397 x = _Py_HashBytes(PyUnicode_DATA(self),
11398 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011400 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401}
11402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011403PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011404 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011406Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
11408static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011411 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011412 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011413 Py_ssize_t start;
11414 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
Jesus Ceaac451502011-04-20 17:09:23 +020011416 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11417 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
Christian Heimesd47a0452013-06-29 21:21:37 +020011420 if (PyUnicode_READY(self) == -1) {
11421 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011423 }
11424 if (PyUnicode_READY(substring) == -1) {
11425 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428
Victor Stinner7931d9a2011-11-04 00:22:48 +010011429 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (result == -2)
11434 return NULL;
11435
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436 if (result < 0) {
11437 PyErr_SetString(PyExc_ValueError, "substring not found");
11438 return NULL;
11439 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011440
Christian Heimes217cfd12007-12-02 14:31:20 +000011441 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442}
11443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011444PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011447Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011448at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
11450static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011451unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 Py_ssize_t i, length;
11454 int kind;
11455 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 int cased;
11457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (PyUnicode_READY(self) == -1)
11459 return NULL;
11460 length = PyUnicode_GET_LENGTH(self);
11461 kind = PyUnicode_KIND(self);
11462 data = PyUnicode_DATA(self);
11463
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 if (length == 1)
11466 return PyBool_FromLong(
11467 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011469 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011472
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 for (i = 0; i < length; i++) {
11475 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011476
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11478 return PyBool_FromLong(0);
11479 else if (!cased && Py_UNICODE_ISLOWER(ch))
11480 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011482 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483}
11484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011485PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011488Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011489at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
11491static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011492unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 Py_ssize_t i, length;
11495 int kind;
11496 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 int cased;
11498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (PyUnicode_READY(self) == -1)
11500 return NULL;
11501 length = PyUnicode_GET_LENGTH(self);
11502 kind = PyUnicode_KIND(self);
11503 data = PyUnicode_DATA(self);
11504
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 if (length == 1)
11507 return PyBool_FromLong(
11508 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011510 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011513
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 for (i = 0; i < length; i++) {
11516 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011517
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11519 return PyBool_FromLong(0);
11520 else if (!cased && Py_UNICODE_ISUPPER(ch))
11521 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011523 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524}
11525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011526PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011529Return True if S is a titlecased string and there is at least one\n\
11530character in S, i.e. upper- and titlecase characters may only\n\
11531follow uncased characters and lowercase characters only cased ones.\n\
11532Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533
11534static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011535unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 Py_ssize_t i, length;
11538 int kind;
11539 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540 int cased, previous_is_cased;
11541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 if (PyUnicode_READY(self) == -1)
11543 return NULL;
11544 length = PyUnicode_GET_LENGTH(self);
11545 kind = PyUnicode_KIND(self);
11546 data = PyUnicode_DATA(self);
11547
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 if (length == 1) {
11550 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11551 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11552 (Py_UNICODE_ISUPPER(ch) != 0));
11553 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011555 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011558
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 cased = 0;
11560 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 for (i = 0; i < length; i++) {
11562 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011563
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11565 if (previous_is_cased)
11566 return PyBool_FromLong(0);
11567 previous_is_cased = 1;
11568 cased = 1;
11569 }
11570 else if (Py_UNICODE_ISLOWER(ch)) {
11571 if (!previous_is_cased)
11572 return PyBool_FromLong(0);
11573 previous_is_cased = 1;
11574 cased = 1;
11575 }
11576 else
11577 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011579 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580}
11581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011582PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011585Return True if all characters in S are whitespace\n\
11586and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
11588static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011589unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 Py_ssize_t i, length;
11592 int kind;
11593 void *data;
11594
11595 if (PyUnicode_READY(self) == -1)
11596 return NULL;
11597 length = PyUnicode_GET_LENGTH(self);
11598 kind = PyUnicode_KIND(self);
11599 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 if (length == 1)
11603 return PyBool_FromLong(
11604 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011606 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011608 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 for (i = 0; i < length; i++) {
11611 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011612 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011615 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616}
11617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011618PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011620\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011621Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011622and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011623
11624static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011625unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011626{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 Py_ssize_t i, length;
11628 int kind;
11629 void *data;
11630
11631 if (PyUnicode_READY(self) == -1)
11632 return NULL;
11633 length = PyUnicode_GET_LENGTH(self);
11634 kind = PyUnicode_KIND(self);
11635 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011636
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011637 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 if (length == 1)
11639 return PyBool_FromLong(
11640 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011641
11642 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011644 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 for (i = 0; i < length; i++) {
11647 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011649 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011650 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011651}
11652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011653PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011654 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011655\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011656Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011657and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011658
11659static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011660unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 int kind;
11663 void *data;
11664 Py_ssize_t len, i;
11665
11666 if (PyUnicode_READY(self) == -1)
11667 return NULL;
11668
11669 kind = PyUnicode_KIND(self);
11670 data = PyUnicode_DATA(self);
11671 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011672
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011673 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 if (len == 1) {
11675 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11676 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11677 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011678
11679 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011681 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 for (i = 0; i < len; i++) {
11684 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011685 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011686 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011687 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011688 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011689}
11690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011691PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011692 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011694Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011695False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696
11697static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011698unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 Py_ssize_t i, length;
11701 int kind;
11702 void *data;
11703
11704 if (PyUnicode_READY(self) == -1)
11705 return NULL;
11706 length = PyUnicode_GET_LENGTH(self);
11707 kind = PyUnicode_KIND(self);
11708 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 if (length == 1)
11712 return PyBool_FromLong(
11713 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011715 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011717 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 for (i = 0; i < length; i++) {
11720 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011723 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724}
11725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011726PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011727 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011729Return True if all characters in S are digits\n\
11730and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
11732static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011733unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 Py_ssize_t i, length;
11736 int kind;
11737 void *data;
11738
11739 if (PyUnicode_READY(self) == -1)
11740 return NULL;
11741 length = PyUnicode_GET_LENGTH(self);
11742 kind = PyUnicode_KIND(self);
11743 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (length == 1) {
11747 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11748 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011751 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 for (i = 0; i < length; i++) {
11756 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011759 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760}
11761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011765Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011766False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
11768static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011769unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 Py_ssize_t i, length;
11772 int kind;
11773 void *data;
11774
11775 if (PyUnicode_READY(self) == -1)
11776 return NULL;
11777 length = PyUnicode_GET_LENGTH(self);
11778 kind = PyUnicode_KIND(self);
11779 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 if (length == 1)
11783 return PyBool_FromLong(
11784 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011786 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 for (i = 0; i < length; i++) {
11791 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011794 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795}
11796
Martin v. Löwis47383402007-08-15 07:32:56 +000011797int
11798PyUnicode_IsIdentifier(PyObject *self)
11799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 int kind;
11801 void *data;
11802 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011803 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (PyUnicode_READY(self) == -1) {
11806 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 }
11809
11810 /* Special case for empty strings */
11811 if (PyUnicode_GET_LENGTH(self) == 0)
11812 return 0;
11813 kind = PyUnicode_KIND(self);
11814 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011815
11816 /* PEP 3131 says that the first character must be in
11817 XID_Start and subsequent characters in XID_Continue,
11818 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011819 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011820 letters, digits, underscore). However, given the current
11821 definition of XID_Start and XID_Continue, it is sufficient
11822 to check just for these, except that _ must be allowed
11823 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011825 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011826 return 0;
11827
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011828 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011831 return 1;
11832}
11833
11834PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011836\n\
11837Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011838to the language definition.\n\
11839\n\
11840Use keyword.iskeyword() to test for reserved identifiers\n\
11841such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011842
11843static PyObject*
11844unicode_isidentifier(PyObject *self)
11845{
11846 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11847}
11848
Georg Brandl559e5d72008-06-11 18:37:52 +000011849PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011851\n\
11852Return True if all characters in S are considered\n\
11853printable in repr() or S is empty, False otherwise.");
11854
11855static PyObject*
11856unicode_isprintable(PyObject *self)
11857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 Py_ssize_t i, length;
11859 int kind;
11860 void *data;
11861
11862 if (PyUnicode_READY(self) == -1)
11863 return NULL;
11864 length = PyUnicode_GET_LENGTH(self);
11865 kind = PyUnicode_KIND(self);
11866 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011867
11868 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 if (length == 1)
11870 return PyBool_FromLong(
11871 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 for (i = 0; i < length; i++) {
11874 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011875 Py_RETURN_FALSE;
11876 }
11877 }
11878 Py_RETURN_TRUE;
11879}
11880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011881PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011882 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883\n\
11884Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011885iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
11887static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011888unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011890 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891}
11892
Martin v. Löwis18e16552006-02-15 17:27:45 +000011893static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011894unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 if (PyUnicode_READY(self) == -1)
11897 return -1;
11898 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899}
11900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011901PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011904Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011905done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
11907static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011908unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011910 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 Py_UCS4 fillchar = ' ';
11912
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011913 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 return NULL;
11915
Benjamin Petersonbac79492012-01-14 13:34:47 -050011916 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011917 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918
Victor Stinnerc4b49542011-12-11 22:44:26 +010011919 if (PyUnicode_GET_LENGTH(self) >= width)
11920 return unicode_result_unchanged(self);
11921
11922 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923}
11924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011925PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011928Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
11930static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011931unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011933 if (PyUnicode_READY(self) == -1)
11934 return NULL;
11935 if (PyUnicode_IS_ASCII(self))
11936 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011937 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938}
11939
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011940#define LEFTSTRIP 0
11941#define RIGHTSTRIP 1
11942#define BOTHSTRIP 2
11943
11944/* Arrays indexed by above */
11945static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
11946
11947#define STRIPNAME(i) (stripformat[i]+3)
11948
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011949/* externally visible for str.strip(unicode) */
11950PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011951_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 void *data;
11954 int kind;
11955 Py_ssize_t i, j, len;
11956 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011957 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11960 return NULL;
11961
11962 kind = PyUnicode_KIND(self);
11963 data = PyUnicode_DATA(self);
11964 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011965 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11967 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011968 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011969
Benjamin Peterson14339b62009-01-31 16:36:08 +000011970 i = 0;
11971 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011972 while (i < len) {
11973 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11974 if (!BLOOM(sepmask, ch))
11975 break;
11976 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11977 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 i++;
11979 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011980 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011981
Benjamin Peterson14339b62009-01-31 16:36:08 +000011982 j = len;
11983 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011984 j--;
11985 while (j >= i) {
11986 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11987 if (!BLOOM(sepmask, ch))
11988 break;
11989 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11990 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011992 }
11993
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011995 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011996
Victor Stinner7931d9a2011-11-04 00:22:48 +010011997 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998}
11999
12000PyObject*
12001PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12002{
12003 unsigned char *data;
12004 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012005 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006
Victor Stinnerde636f32011-10-01 03:55:54 +020012007 if (PyUnicode_READY(self) == -1)
12008 return NULL;
12009
Victor Stinner684d5fd2012-05-03 02:32:34 +020012010 length = PyUnicode_GET_LENGTH(self);
12011 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012012
Victor Stinner684d5fd2012-05-03 02:32:34 +020012013 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012014 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015
Victor Stinnerde636f32011-10-01 03:55:54 +020012016 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012017 PyErr_SetString(PyExc_IndexError, "string index out of range");
12018 return NULL;
12019 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012020 if (start >= length || end < start)
12021 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012022
Victor Stinner684d5fd2012-05-03 02:32:34 +020012023 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012024 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012025 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012026 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012027 }
12028 else {
12029 kind = PyUnicode_KIND(self);
12030 data = PyUnicode_1BYTE_DATA(self);
12031 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012032 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012033 length);
12034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036
12037static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012038do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 Py_ssize_t len, i, j;
12041
12042 if (PyUnicode_READY(self) == -1)
12043 return NULL;
12044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012046
Victor Stinnercc7af722013-04-09 22:39:24 +020012047 if (PyUnicode_IS_ASCII(self)) {
12048 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12049
12050 i = 0;
12051 if (striptype != RIGHTSTRIP) {
12052 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012053 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012054 if (!_Py_ascii_whitespace[ch])
12055 break;
12056 i++;
12057 }
12058 }
12059
12060 j = len;
12061 if (striptype != LEFTSTRIP) {
12062 j--;
12063 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012064 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012065 if (!_Py_ascii_whitespace[ch])
12066 break;
12067 j--;
12068 }
12069 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012070 }
12071 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012072 else {
12073 int kind = PyUnicode_KIND(self);
12074 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012075
Victor Stinnercc7af722013-04-09 22:39:24 +020012076 i = 0;
12077 if (striptype != RIGHTSTRIP) {
12078 while (i < len) {
12079 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12080 if (!Py_UNICODE_ISSPACE(ch))
12081 break;
12082 i++;
12083 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012084 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012085
12086 j = len;
12087 if (striptype != LEFTSTRIP) {
12088 j--;
12089 while (j >= i) {
12090 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12091 if (!Py_UNICODE_ISSPACE(ch))
12092 break;
12093 j--;
12094 }
12095 j++;
12096 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012097 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012098
Victor Stinner7931d9a2011-11-04 00:22:48 +010012099 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100}
12101
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012102
12103static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012104do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012105{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012106 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012107
Serhiy Storchakac6792272013-10-19 21:03:34 +030012108 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012109 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012110
Benjamin Peterson14339b62009-01-31 16:36:08 +000012111 if (sep != NULL && sep != Py_None) {
12112 if (PyUnicode_Check(sep))
12113 return _PyUnicode_XStrip(self, striptype, sep);
12114 else {
12115 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 "%s arg must be None or str",
12117 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012118 return NULL;
12119 }
12120 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012121
Benjamin Peterson14339b62009-01-31 16:36:08 +000012122 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012123}
12124
12125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128\n\
12129Return a copy of the string S with leading and trailing\n\
12130whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012131If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012132
12133static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012134unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012136 if (PyTuple_GET_SIZE(args) == 0)
12137 return do_strip(self, BOTHSTRIP); /* Common case */
12138 else
12139 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012140}
12141
12142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012143PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012144 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012145\n\
12146Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012147If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012148
12149static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012150unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012152 if (PyTuple_GET_SIZE(args) == 0)
12153 return do_strip(self, LEFTSTRIP); /* Common case */
12154 else
12155 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012156}
12157
12158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012159PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012160 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012161\n\
12162Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012163If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012164
12165static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012166unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012167{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012168 if (PyTuple_GET_SIZE(args) == 0)
12169 return do_strip(self, RIGHTSTRIP); /* Common case */
12170 else
12171 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012172}
12173
12174
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012176unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012178 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180
Serhiy Storchaka05997252013-01-26 12:14:02 +020012181 if (len < 1)
12182 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183
Victor Stinnerc4b49542011-12-11 22:44:26 +010012184 /* no repeat, return original string */
12185 if (len == 1)
12186 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012187
Benjamin Petersonbac79492012-01-14 13:34:47 -050012188 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 return NULL;
12190
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012191 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012192 PyErr_SetString(PyExc_OverflowError,
12193 "repeated string is too long");
12194 return NULL;
12195 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012197
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012198 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199 if (!u)
12200 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012201 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 if (PyUnicode_GET_LENGTH(str) == 1) {
12204 const int kind = PyUnicode_KIND(str);
12205 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012206 if (kind == PyUnicode_1BYTE_KIND) {
12207 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012208 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012209 }
12210 else if (kind == PyUnicode_2BYTE_KIND) {
12211 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012212 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012213 ucs2[n] = fill_char;
12214 } else {
12215 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12216 assert(kind == PyUnicode_4BYTE_KIND);
12217 for (n = 0; n < len; ++n)
12218 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 }
12221 else {
12222 /* number of characters copied this far */
12223 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012224 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 char *to = (char *) PyUnicode_DATA(u);
12226 Py_MEMCPY(to, PyUnicode_DATA(str),
12227 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012228 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 n = (done <= nchars-done) ? done : nchars-done;
12230 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012231 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233 }
12234
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012235 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012236 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237}
12238
Alexander Belopolsky40018472011-02-26 01:02:56 +000012239PyObject *
12240PyUnicode_Replace(PyObject *obj,
12241 PyObject *subobj,
12242 PyObject *replobj,
12243 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
12245 PyObject *self;
12246 PyObject *str1;
12247 PyObject *str2;
12248 PyObject *result;
12249
12250 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012251 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012254 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 Py_DECREF(self);
12256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 }
12258 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012259 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 Py_DECREF(self);
12261 Py_DECREF(str1);
12262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012264 if (PyUnicode_READY(self) == -1 ||
12265 PyUnicode_READY(str1) == -1 ||
12266 PyUnicode_READY(str2) == -1)
12267 result = NULL;
12268 else
12269 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270 Py_DECREF(self);
12271 Py_DECREF(str1);
12272 Py_DECREF(str2);
12273 return result;
12274}
12275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012276PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012277 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278\n\
12279Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012280old replaced by new. If the optional argument count is\n\
12281given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
12283static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 PyObject *str1;
12287 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012288 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289 PyObject *result;
12290
Martin v. Löwis18e16552006-02-15 17:27:45 +000012291 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012293 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012296 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 return NULL;
12298 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012299 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 Py_DECREF(str1);
12301 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012302 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012303 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12304 result = NULL;
12305 else
12306 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
12308 Py_DECREF(str1);
12309 Py_DECREF(str2);
12310 return result;
12311}
12312
Alexander Belopolsky40018472011-02-26 01:02:56 +000012313static PyObject *
12314unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012316 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 Py_ssize_t isize;
12318 Py_ssize_t osize, squote, dquote, i, o;
12319 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012320 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012324 return NULL;
12325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 isize = PyUnicode_GET_LENGTH(unicode);
12327 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 /* Compute length of output, quote characters, and
12330 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012331 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 max = 127;
12333 squote = dquote = 0;
12334 ikind = PyUnicode_KIND(unicode);
12335 for (i = 0; i < isize; i++) {
12336 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12337 switch (ch) {
12338 case '\'': squote++; osize++; break;
12339 case '"': dquote++; osize++; break;
12340 case '\\': case '\t': case '\r': case '\n':
12341 osize += 2; break;
12342 default:
12343 /* Fast-path ASCII */
12344 if (ch < ' ' || ch == 0x7f)
12345 osize += 4; /* \xHH */
12346 else if (ch < 0x7f)
12347 osize++;
12348 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12349 osize++;
12350 max = ch > max ? ch : max;
12351 }
12352 else if (ch < 0x100)
12353 osize += 4; /* \xHH */
12354 else if (ch < 0x10000)
12355 osize += 6; /* \uHHHH */
12356 else
12357 osize += 10; /* \uHHHHHHHH */
12358 }
12359 }
12360
12361 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012362 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012364 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 if (dquote)
12366 /* Both squote and dquote present. Use squote,
12367 and escape them */
12368 osize += squote;
12369 else
12370 quote = '"';
12371 }
Victor Stinner55c08782013-04-14 18:45:39 +020012372 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373
12374 repr = PyUnicode_New(osize, max);
12375 if (repr == NULL)
12376 return NULL;
12377 okind = PyUnicode_KIND(repr);
12378 odata = PyUnicode_DATA(repr);
12379
12380 PyUnicode_WRITE(okind, odata, 0, quote);
12381 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012382 if (unchanged) {
12383 _PyUnicode_FastCopyCharacters(repr, 1,
12384 unicode, 0,
12385 isize);
12386 }
12387 else {
12388 for (i = 0, o = 1; i < isize; i++) {
12389 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390
Victor Stinner55c08782013-04-14 18:45:39 +020012391 /* Escape quotes and backslashes */
12392 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012393 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012395 continue;
12396 }
12397
12398 /* Map special whitespace to '\t', \n', '\r' */
12399 if (ch == '\t') {
12400 PyUnicode_WRITE(okind, odata, o++, '\\');
12401 PyUnicode_WRITE(okind, odata, o++, 't');
12402 }
12403 else if (ch == '\n') {
12404 PyUnicode_WRITE(okind, odata, o++, '\\');
12405 PyUnicode_WRITE(okind, odata, o++, 'n');
12406 }
12407 else if (ch == '\r') {
12408 PyUnicode_WRITE(okind, odata, o++, '\\');
12409 PyUnicode_WRITE(okind, odata, o++, 'r');
12410 }
12411
12412 /* Map non-printable US ASCII to '\xhh' */
12413 else if (ch < ' ' || ch == 0x7F) {
12414 PyUnicode_WRITE(okind, odata, o++, '\\');
12415 PyUnicode_WRITE(okind, odata, o++, 'x');
12416 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12417 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12418 }
12419
12420 /* Copy ASCII characters as-is */
12421 else if (ch < 0x7F) {
12422 PyUnicode_WRITE(okind, odata, o++, ch);
12423 }
12424
12425 /* Non-ASCII characters */
12426 else {
12427 /* Map Unicode whitespace and control characters
12428 (categories Z* and C* except ASCII space)
12429 */
12430 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12431 PyUnicode_WRITE(okind, odata, o++, '\\');
12432 /* Map 8-bit characters to '\xhh' */
12433 if (ch <= 0xff) {
12434 PyUnicode_WRITE(okind, odata, o++, 'x');
12435 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12437 }
12438 /* Map 16-bit characters to '\uxxxx' */
12439 else if (ch <= 0xffff) {
12440 PyUnicode_WRITE(okind, odata, o++, 'u');
12441 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12442 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12443 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12444 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12445 }
12446 /* Map 21-bit characters to '\U00xxxxxx' */
12447 else {
12448 PyUnicode_WRITE(okind, odata, o++, 'U');
12449 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12450 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12451 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12455 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12456 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12457 }
12458 }
12459 /* Copy characters as-is */
12460 else {
12461 PyUnicode_WRITE(okind, odata, o++, ch);
12462 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012463 }
12464 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012467 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012468 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469}
12470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012471PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012472 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473\n\
12474Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012475such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476arguments start and end are interpreted as in slice notation.\n\
12477\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012478Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479
12480static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012483 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012484 Py_ssize_t start;
12485 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012486 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487
Jesus Ceaac451502011-04-20 17:09:23 +020012488 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12489 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491
Christian Heimesea71a522013-06-29 21:17:34 +020012492 if (PyUnicode_READY(self) == -1) {
12493 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012495 }
12496 if (PyUnicode_READY(substring) == -1) {
12497 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500
Victor Stinner7931d9a2011-11-04 00:22:48 +010012501 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502
12503 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 if (result == -2)
12506 return NULL;
12507
Christian Heimes217cfd12007-12-02 14:31:20 +000012508 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509}
12510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012511PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012514Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
12516static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012519 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012520 Py_ssize_t start;
12521 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012522 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523
Jesus Ceaac451502011-04-20 17:09:23 +020012524 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12525 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527
Christian Heimesea71a522013-06-29 21:17:34 +020012528 if (PyUnicode_READY(self) == -1) {
12529 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012531 }
12532 if (PyUnicode_READY(substring) == -1) {
12533 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536
Victor Stinner7931d9a2011-11-04 00:22:48 +010012537 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538
12539 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 if (result == -2)
12542 return NULL;
12543
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544 if (result < 0) {
12545 PyErr_SetString(PyExc_ValueError, "substring not found");
12546 return NULL;
12547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548
Christian Heimes217cfd12007-12-02 14:31:20 +000012549 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550}
12551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012552PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012555Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012556done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557
12558static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012559unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012561 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012562 Py_UCS4 fillchar = ' ';
12563
Victor Stinnere9a29352011-10-01 02:14:59 +020012564 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012566
Benjamin Petersonbac79492012-01-14 13:34:47 -050012567 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568 return NULL;
12569
Victor Stinnerc4b49542011-12-11 22:44:26 +010012570 if (PyUnicode_GET_LENGTH(self) >= width)
12571 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572
Victor Stinnerc4b49542011-12-11 22:44:26 +010012573 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574}
12575
Alexander Belopolsky40018472011-02-26 01:02:56 +000012576PyObject *
12577PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578{
12579 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012580
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581 s = PyUnicode_FromObject(s);
12582 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012583 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 if (sep != NULL) {
12585 sep = PyUnicode_FromObject(sep);
12586 if (sep == NULL) {
12587 Py_DECREF(s);
12588 return NULL;
12589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590 }
12591
Victor Stinner9310abb2011-10-05 00:59:23 +020012592 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593
12594 Py_DECREF(s);
12595 Py_XDECREF(sep);
12596 return result;
12597}
12598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012599PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012600 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601\n\
12602Return a list of the words in S, using sep as the\n\
12603delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012604splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012605whitespace string is a separator and empty strings are\n\
12606removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607
12608static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012609unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012611 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012613 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012615 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12616 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617 return NULL;
12618
12619 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012622 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012624 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625}
12626
Thomas Wouters477c8d52006-05-27 19:21:47 +000012627PyObject *
12628PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12629{
12630 PyObject* str_obj;
12631 PyObject* sep_obj;
12632 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 int kind1, kind2, kind;
12634 void *buf1 = NULL, *buf2 = NULL;
12635 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012636
12637 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012638 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012640 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012641 if (!sep_obj) {
12642 Py_DECREF(str_obj);
12643 return NULL;
12644 }
12645 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12646 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012647 Py_DECREF(str_obj);
12648 return NULL;
12649 }
12650
Victor Stinner14f8f022011-10-05 20:58:25 +020012651 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012653 kind = Py_MAX(kind1, kind2);
12654 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012656 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 if (!buf1)
12658 goto onError;
12659 buf2 = PyUnicode_DATA(sep_obj);
12660 if (kind2 != kind)
12661 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12662 if (!buf2)
12663 goto onError;
12664 len1 = PyUnicode_GET_LENGTH(str_obj);
12665 len2 = PyUnicode_GET_LENGTH(sep_obj);
12666
Benjamin Petersonead6b532011-12-20 17:23:42 -060012667 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012669 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12670 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12671 else
12672 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 break;
12674 case PyUnicode_2BYTE_KIND:
12675 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12676 break;
12677 case PyUnicode_4BYTE_KIND:
12678 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12679 break;
12680 default:
12681 assert(0);
12682 out = 0;
12683 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012684
12685 Py_DECREF(sep_obj);
12686 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 if (kind1 != kind)
12688 PyMem_Free(buf1);
12689 if (kind2 != kind)
12690 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012691
12692 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 onError:
12694 Py_DECREF(sep_obj);
12695 Py_DECREF(str_obj);
12696 if (kind1 != kind && buf1)
12697 PyMem_Free(buf1);
12698 if (kind2 != kind && buf2)
12699 PyMem_Free(buf2);
12700 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012701}
12702
12703
12704PyObject *
12705PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12706{
12707 PyObject* str_obj;
12708 PyObject* sep_obj;
12709 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 int kind1, kind2, kind;
12711 void *buf1 = NULL, *buf2 = NULL;
12712 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012713
12714 str_obj = PyUnicode_FromObject(str_in);
12715 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012717 sep_obj = PyUnicode_FromObject(sep_in);
12718 if (!sep_obj) {
12719 Py_DECREF(str_obj);
12720 return NULL;
12721 }
12722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 kind1 = PyUnicode_KIND(str_in);
12724 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012725 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 buf1 = PyUnicode_DATA(str_in);
12727 if (kind1 != kind)
12728 buf1 = _PyUnicode_AsKind(str_in, kind);
12729 if (!buf1)
12730 goto onError;
12731 buf2 = PyUnicode_DATA(sep_obj);
12732 if (kind2 != kind)
12733 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12734 if (!buf2)
12735 goto onError;
12736 len1 = PyUnicode_GET_LENGTH(str_obj);
12737 len2 = PyUnicode_GET_LENGTH(sep_obj);
12738
Benjamin Petersonead6b532011-12-20 17:23:42 -060012739 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012741 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12742 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12743 else
12744 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 break;
12746 case PyUnicode_2BYTE_KIND:
12747 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12748 break;
12749 case PyUnicode_4BYTE_KIND:
12750 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12751 break;
12752 default:
12753 assert(0);
12754 out = 0;
12755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012756
12757 Py_DECREF(sep_obj);
12758 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012759 if (kind1 != kind)
12760 PyMem_Free(buf1);
12761 if (kind2 != kind)
12762 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012763
12764 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 onError:
12766 Py_DECREF(sep_obj);
12767 Py_DECREF(str_obj);
12768 if (kind1 != kind && buf1)
12769 PyMem_Free(buf1);
12770 if (kind2 != kind && buf2)
12771 PyMem_Free(buf2);
12772 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012773}
12774
12775PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012776 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012777\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012778Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012779the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012780found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012781
12782static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012783unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012784{
Victor Stinner9310abb2011-10-05 00:59:23 +020012785 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786}
12787
12788PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012789 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012791Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012792the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012793separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794
12795static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012796unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797{
Victor Stinner9310abb2011-10-05 00:59:23 +020012798 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012799}
12800
Alexander Belopolsky40018472011-02-26 01:02:56 +000012801PyObject *
12802PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012803{
12804 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012805
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012806 s = PyUnicode_FromObject(s);
12807 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 if (sep != NULL) {
12810 sep = PyUnicode_FromObject(sep);
12811 if (sep == NULL) {
12812 Py_DECREF(s);
12813 return NULL;
12814 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012815 }
12816
Victor Stinner9310abb2011-10-05 00:59:23 +020012817 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012818
12819 Py_DECREF(s);
12820 Py_XDECREF(sep);
12821 return result;
12822}
12823
12824PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012825 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012826\n\
12827Return a list of the words in S, using sep as the\n\
12828delimiter string, starting at the end of the string and\n\
12829working to the front. If maxsplit is given, at most maxsplit\n\
12830splits are done. If sep is not specified, any whitespace string\n\
12831is a separator.");
12832
12833static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012834unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012835{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012836 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012837 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012838 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012839
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012840 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12841 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012842 return NULL;
12843
12844 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012846 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012847 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012848 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012849 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012850}
12851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012852PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854\n\
12855Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012856Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012857is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858
12859static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012860unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012862 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012863 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012865 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12866 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867 return NULL;
12868
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012869 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870}
12871
12872static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012873PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012875 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876}
12877
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012878PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012879 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880\n\
12881Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012882and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883
12884static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012885unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012887 if (PyUnicode_READY(self) == -1)
12888 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012889 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890}
12891
Larry Hastings61272b72014-01-07 12:41:53 -080012892/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012893
Larry Hastings31826802013-10-19 00:09:25 -070012894@staticmethod
12895str.maketrans as unicode_maketrans
12896
12897 x: object
12898
12899 y: unicode=NULL
12900
12901 z: unicode=NULL
12902
12903 /
12904
12905Return a translation table usable for str.translate().
12906
12907If there is only one argument, it must be a dictionary mapping Unicode
12908ordinals (integers) or characters to Unicode ordinals, strings or None.
12909Character keys will be then converted to ordinals.
12910If there are two arguments, they must be strings of equal length, and
12911in the resulting dictionary, each character in x will be mapped to the
12912character at the same position in y. If there is a third argument, it
12913must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012914[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012915
12916PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -080012917"maketrans(x, y=None, z=None, /)\n"
12918"--\n"
12919"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012920"Return a translation table usable for str.translate().\n"
12921"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012922"If there is only one argument, it must be a dictionary mapping Unicode\n"
12923"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12924"Character keys will be then converted to ordinals.\n"
12925"If there are two arguments, they must be strings of equal length, and\n"
12926"in the resulting dictionary, each character in x will be mapped to the\n"
12927"character at the same position in y. If there is a third argument, it\n"
12928"must be a string, whose characters will be mapped to None in the result.");
12929
12930#define UNICODE_MAKETRANS_METHODDEF \
12931 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12932
12933static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012934unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012935
12936static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012937unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012938{
Larry Hastings31826802013-10-19 00:09:25 -070012939 PyObject *return_value = NULL;
12940 PyObject *x;
12941 PyObject *y = NULL;
12942 PyObject *z = NULL;
12943
12944 if (!PyArg_ParseTuple(args,
12945 "O|UU:maketrans",
12946 &x, &y, &z))
12947 goto exit;
Larry Hastings5c661892014-01-24 06:17:25 -080012948 return_value = unicode_maketrans_impl(x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012949
12950exit:
12951 return return_value;
12952}
12953
12954static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012955unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012956/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012957{
Georg Brandlceee0772007-11-27 23:48:05 +000012958 PyObject *new = NULL, *key, *value;
12959 Py_ssize_t i = 0;
12960 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012961
Georg Brandlceee0772007-11-27 23:48:05 +000012962 new = PyDict_New();
12963 if (!new)
12964 return NULL;
12965 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 int x_kind, y_kind, z_kind;
12967 void *x_data, *y_data, *z_data;
12968
Georg Brandlceee0772007-11-27 23:48:05 +000012969 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012970 if (!PyUnicode_Check(x)) {
12971 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12972 "be a string if there is a second argument");
12973 goto err;
12974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012976 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12977 "arguments must have equal length");
12978 goto err;
12979 }
12980 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 x_kind = PyUnicode_KIND(x);
12982 y_kind = PyUnicode_KIND(y);
12983 x_data = PyUnicode_DATA(x);
12984 y_data = PyUnicode_DATA(y);
12985 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12986 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012987 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012988 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012989 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012990 if (!value) {
12991 Py_DECREF(key);
12992 goto err;
12993 }
Georg Brandlceee0772007-11-27 23:48:05 +000012994 res = PyDict_SetItem(new, key, value);
12995 Py_DECREF(key);
12996 Py_DECREF(value);
12997 if (res < 0)
12998 goto err;
12999 }
13000 /* create entries for deleting chars in z */
13001 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 z_kind = PyUnicode_KIND(z);
13003 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013004 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013006 if (!key)
13007 goto err;
13008 res = PyDict_SetItem(new, key, Py_None);
13009 Py_DECREF(key);
13010 if (res < 0)
13011 goto err;
13012 }
13013 }
13014 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 int kind;
13016 void *data;
13017
Georg Brandlceee0772007-11-27 23:48:05 +000013018 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013019 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013020 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13021 "to maketrans it must be a dict");
13022 goto err;
13023 }
13024 /* copy entries into the new dict, converting string keys to int keys */
13025 while (PyDict_Next(x, &i, &key, &value)) {
13026 if (PyUnicode_Check(key)) {
13027 /* convert string keys to integer keys */
13028 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013029 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013030 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13031 "table must be of length 1");
13032 goto err;
13033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 kind = PyUnicode_KIND(key);
13035 data = PyUnicode_DATA(key);
13036 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013037 if (!newkey)
13038 goto err;
13039 res = PyDict_SetItem(new, newkey, value);
13040 Py_DECREF(newkey);
13041 if (res < 0)
13042 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013043 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013044 /* just keep integer keys */
13045 if (PyDict_SetItem(new, key, value) < 0)
13046 goto err;
13047 } else {
13048 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13049 "be strings or integers");
13050 goto err;
13051 }
13052 }
13053 }
13054 return new;
13055 err:
13056 Py_DECREF(new);
13057 return NULL;
13058}
13059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013060PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013061 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062\n\
13063Return a copy of the string S, where all characters have been mapped\n\
13064through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013065Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013066Unmapped characters are left untouched. Characters mapped to None\n\
13067are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068
13069static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073}
13074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013075PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013076 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013078Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079
13080static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013081unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013083 if (PyUnicode_READY(self) == -1)
13084 return NULL;
13085 if (PyUnicode_IS_ASCII(self))
13086 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013087 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088}
13089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013090PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013093Pad a numeric string S with zeros on the left, to fill a field\n\
13094of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095
13096static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013097unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013099 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013100 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013101 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013102 int kind;
13103 void *data;
13104 Py_UCS4 chr;
13105
Martin v. Löwis18e16552006-02-15 17:27:45 +000013106 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107 return NULL;
13108
Benjamin Petersonbac79492012-01-14 13:34:47 -050013109 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013110 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111
Victor Stinnerc4b49542011-12-11 22:44:26 +010013112 if (PyUnicode_GET_LENGTH(self) >= width)
13113 return unicode_result_unchanged(self);
13114
13115 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116
13117 u = pad(self, fill, 0, '0');
13118
Walter Dörwald068325e2002-04-15 13:36:47 +000013119 if (u == NULL)
13120 return NULL;
13121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 kind = PyUnicode_KIND(u);
13123 data = PyUnicode_DATA(u);
13124 chr = PyUnicode_READ(kind, data, fill);
13125
13126 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 PyUnicode_WRITE(kind, data, 0, chr);
13129 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130 }
13131
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013132 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013133 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135
#if 0
/* Compiled-out wrapper that exposes
   PyUnicode_TransformDecimalAndSpaceToASCII() as a method taking only
   self; its result (or NULL) is returned as-is. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii_form;

    ascii_form = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii_form;
}
#endif
13143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013144PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013145 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013147Return True if S starts with the specified prefix, False otherwise.\n\
13148With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013149With optional end, stop comparing S at that position.\n\
13150prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151
13152static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013153unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013156 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013157 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013158 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013159 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013160 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161
Jesus Ceaac451502011-04-20 17:09:23 +020013162 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013163 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013164 if (PyTuple_Check(subobj)) {
13165 Py_ssize_t i;
13166 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013167 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013168 if (substring == NULL)
13169 return NULL;
13170 result = tailmatch(self, substring, start, end, -1);
13171 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013172 if (result == -1)
13173 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013174 if (result) {
13175 Py_RETURN_TRUE;
13176 }
13177 }
13178 /* nothing matched */
13179 Py_RETURN_FALSE;
13180 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013181 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013182 if (substring == NULL) {
13183 if (PyErr_ExceptionMatches(PyExc_TypeError))
13184 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13185 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013186 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013187 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013188 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013190 if (result == -1)
13191 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013192 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193}
13194
13195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013196PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013199Return True if S ends with the specified suffix, False otherwise.\n\
13200With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013201With optional end, stop comparing S at that position.\n\
13202suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203
13204static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013205unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013208 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013209 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013210 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013211 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013212 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213
Jesus Ceaac451502011-04-20 17:09:23 +020013214 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013215 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013216 if (PyTuple_Check(subobj)) {
13217 Py_ssize_t i;
13218 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013219 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013221 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013222 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013223 result = tailmatch(self, substring, start, end, +1);
13224 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013225 if (result == -1)
13226 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013227 if (result) {
13228 Py_RETURN_TRUE;
13229 }
13230 }
13231 Py_RETURN_FALSE;
13232 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013233 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013234 if (substring == NULL) {
13235 if (PyErr_ExceptionMatches(PyExc_TypeError))
13236 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13237 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013238 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013239 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013240 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013241 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013242 if (result == -1)
13243 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013244 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245}
13246
Victor Stinner202fdca2012-05-07 12:47:02 +020013247Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013248_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013249{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013250 if (!writer->readonly)
13251 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13252 else {
13253 /* Copy-on-write mode: set buffer size to 0 so
13254 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13255 * next write. */
13256 writer->size = 0;
13257 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013258 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13259 writer->data = PyUnicode_DATA(writer->buffer);
13260 writer->kind = PyUnicode_KIND(writer->buffer);
13261}
13262
Victor Stinnerd3f08822012-05-29 12:57:52 +020013263void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013264_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013265{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013266 memset(writer, 0, sizeof(*writer));
13267#ifdef Py_DEBUG
13268 writer->kind = 5; /* invalid kind */
13269#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013270 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013271}
13272
Victor Stinnerd3f08822012-05-29 12:57:52 +020013273int
13274_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13275 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013276{
Victor Stinner6989ba02013-11-18 21:08:39 +010013277#ifdef MS_WINDOWS
13278 /* On Windows, overallocate by 50% is the best factor */
13279# define OVERALLOCATE_FACTOR 2
13280#else
13281 /* On Linux, overallocate by 25% is the best factor */
13282# define OVERALLOCATE_FACTOR 4
13283#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013284 Py_ssize_t newlen;
13285 PyObject *newbuffer;
13286
Victor Stinnerd3f08822012-05-29 12:57:52 +020013287 assert(length > 0);
13288
Victor Stinner202fdca2012-05-07 12:47:02 +020013289 if (length > PY_SSIZE_T_MAX - writer->pos) {
13290 PyErr_NoMemory();
13291 return -1;
13292 }
13293 newlen = writer->pos + length;
13294
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013295 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013296
Victor Stinnerd3f08822012-05-29 12:57:52 +020013297 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013298 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013299 if (writer->overallocate
13300 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13301 /* overallocate to limit the number of realloc() */
13302 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013303 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013304 if (newlen < writer->min_length)
13305 newlen = writer->min_length;
13306
Victor Stinnerd3f08822012-05-29 12:57:52 +020013307 writer->buffer = PyUnicode_New(newlen, maxchar);
13308 if (writer->buffer == NULL)
13309 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013310 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013311 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013312 if (writer->overallocate
13313 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13314 /* overallocate to limit the number of realloc() */
13315 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013316 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013317 if (newlen < writer->min_length)
13318 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013319
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013320 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013321 /* resize + widen */
13322 newbuffer = PyUnicode_New(newlen, maxchar);
13323 if (newbuffer == NULL)
13324 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013325 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13326 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013327 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013328 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013329 }
13330 else {
13331 newbuffer = resize_compact(writer->buffer, newlen);
13332 if (newbuffer == NULL)
13333 return -1;
13334 }
13335 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013336 }
13337 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013338 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013339 newbuffer = PyUnicode_New(writer->size, maxchar);
13340 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013341 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013342 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13343 writer->buffer, 0, writer->pos);
13344 Py_DECREF(writer->buffer);
13345 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013346 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013347 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013348 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013349
13350#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013351}
13352
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013353Py_LOCAL_INLINE(int)
13354_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013355{
13356 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13357 return -1;
13358 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13359 writer->pos++;
13360 return 0;
13361}
13362
13363int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013364_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13365{
13366 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13367}
13368
13369int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013370_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13371{
13372 Py_UCS4 maxchar;
13373 Py_ssize_t len;
13374
13375 if (PyUnicode_READY(str) == -1)
13376 return -1;
13377 len = PyUnicode_GET_LENGTH(str);
13378 if (len == 0)
13379 return 0;
13380 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13381 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013382 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013383 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013384 Py_INCREF(str);
13385 writer->buffer = str;
13386 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013387 writer->pos += len;
13388 return 0;
13389 }
13390 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13391 return -1;
13392 }
13393 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13394 str, 0, len);
13395 writer->pos += len;
13396 return 0;
13397}
13398
Victor Stinnere215d962012-10-06 23:03:36 +020013399int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013400_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13401 Py_ssize_t start, Py_ssize_t end)
13402{
13403 Py_UCS4 maxchar;
13404 Py_ssize_t len;
13405
13406 if (PyUnicode_READY(str) == -1)
13407 return -1;
13408
13409 assert(0 <= start);
13410 assert(end <= PyUnicode_GET_LENGTH(str));
13411 assert(start <= end);
13412
13413 if (end == 0)
13414 return 0;
13415
13416 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13417 return _PyUnicodeWriter_WriteStr(writer, str);
13418
13419 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13420 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13421 else
13422 maxchar = writer->maxchar;
13423 len = end - start;
13424
13425 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13426 return -1;
13427
13428 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13429 str, start, len);
13430 writer->pos += len;
13431 return 0;
13432}
13433
13434int
Victor Stinner4a587072013-11-19 12:54:53 +010013435_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13436 const char *ascii, Py_ssize_t len)
13437{
13438 if (len == -1)
13439 len = strlen(ascii);
13440
13441 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13442
13443 if (writer->buffer == NULL && !writer->overallocate) {
13444 PyObject *str;
13445
13446 str = _PyUnicode_FromASCII(ascii, len);
13447 if (str == NULL)
13448 return -1;
13449
13450 writer->readonly = 1;
13451 writer->buffer = str;
13452 _PyUnicodeWriter_Update(writer);
13453 writer->pos += len;
13454 return 0;
13455 }
13456
13457 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13458 return -1;
13459
13460 switch (writer->kind)
13461 {
13462 case PyUnicode_1BYTE_KIND:
13463 {
13464 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13465 Py_UCS1 *data = writer->data;
13466
13467 Py_MEMCPY(data + writer->pos, str, len);
13468 break;
13469 }
13470 case PyUnicode_2BYTE_KIND:
13471 {
13472 _PyUnicode_CONVERT_BYTES(
13473 Py_UCS1, Py_UCS2,
13474 ascii, ascii + len,
13475 (Py_UCS2 *)writer->data + writer->pos);
13476 break;
13477 }
13478 case PyUnicode_4BYTE_KIND:
13479 {
13480 _PyUnicode_CONVERT_BYTES(
13481 Py_UCS1, Py_UCS4,
13482 ascii, ascii + len,
13483 (Py_UCS4 *)writer->data + writer->pos);
13484 break;
13485 }
13486 default:
13487 assert(0);
13488 }
13489
13490 writer->pos += len;
13491 return 0;
13492}
13493
13494int
13495_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13496 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013497{
13498 Py_UCS4 maxchar;
13499
13500 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13501 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13502 return -1;
13503 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13504 writer->pos += len;
13505 return 0;
13506}
13507
Victor Stinnerd3f08822012-05-29 12:57:52 +020013508PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013509_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013510{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013511 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013512 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013513 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013514 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013515 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013516 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013517 str = writer->buffer;
13518 writer->buffer = NULL;
13519 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13520 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013521 }
13522 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13523 PyObject *newbuffer;
13524 newbuffer = resize_compact(writer->buffer, writer->pos);
13525 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013526 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013527 return NULL;
13528 }
13529 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013530 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013531 str = writer->buffer;
13532 writer->buffer = NULL;
13533 assert(_PyUnicode_CheckConsistency(str, 1));
13534 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013535}
13536
Victor Stinnerd3f08822012-05-29 12:57:52 +020013537void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013538_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013539{
13540 Py_CLEAR(writer->buffer);
13541}
13542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013543#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013544
13545PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013547\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013548Return a formatted version of S, using substitutions from args and kwargs.\n\
13549The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013550
Eric Smith27bbca62010-11-04 17:06:58 +000013551PyDoc_STRVAR(format_map__doc__,
13552 "S.format_map(mapping) -> str\n\
13553\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013554Return a formatted version of S, using substitutions from mapping.\n\
13555The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013556
Eric Smith4a7d76d2008-05-30 18:10:19 +000013557static PyObject *
13558unicode__format__(PyObject* self, PyObject* args)
13559{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560 PyObject *format_spec;
13561 _PyUnicodeWriter writer;
13562 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013563
13564 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13565 return NULL;
13566
Victor Stinnerd3f08822012-05-29 12:57:52 +020013567 if (PyUnicode_READY(self) == -1)
13568 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013569 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013570 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13571 self, format_spec, 0,
13572 PyUnicode_GET_LENGTH(format_spec));
13573 if (ret == -1) {
13574 _PyUnicodeWriter_Dealloc(&writer);
13575 return NULL;
13576 }
13577 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013578}
13579
Eric Smith8c663262007-08-25 02:26:07 +000013580PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013582\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013583Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013584
13585static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013586unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013587{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013588 Py_ssize_t size;
13589
13590 /* If it's a compact object, account for base structure +
13591 character data. */
13592 if (PyUnicode_IS_COMPACT_ASCII(v))
13593 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13594 else if (PyUnicode_IS_COMPACT(v))
13595 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013596 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013597 else {
13598 /* If it is a two-block object, account for base object, and
13599 for character block if present. */
13600 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013601 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013603 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013604 }
13605 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013606 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013607 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013609 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013610 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611
13612 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013613}
13614
13615PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013616 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013617
13618static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013619unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013620{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013621 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013622 if (!copy)
13623 return NULL;
13624 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013625}
13626
Guido van Rossumd57fd912000-03-10 22:53:23 +000013627static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013628 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013629 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013630 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13631 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013632 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13633 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013634 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013635 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13636 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13637 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013638 {"expandtabs", (PyCFunction) unicode_expandtabs,
13639 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013640 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013641 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013642 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13643 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13644 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013645 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013646 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13647 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13648 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013649 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013650 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013651 {"splitlines", (PyCFunction) unicode_splitlines,
13652 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013653 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013654 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13655 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13656 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13657 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13658 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13659 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13660 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13661 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13662 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13663 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13664 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13665 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13666 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13667 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013668 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013669 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013670 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013671 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013672 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013673 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013674 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013675 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013676#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013677 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013678 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679#endif
13680
Benjamin Peterson14339b62009-01-31 16:36:08 +000013681 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013682 {NULL, NULL}
13683};
13684
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013685static PyObject *
13686unicode_mod(PyObject *v, PyObject *w)
13687{
Brian Curtindfc80e32011-08-10 20:28:54 -050013688 if (!PyUnicode_Check(v))
13689 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013690 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013691}
13692
13693static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013694 0, /*nb_add*/
13695 0, /*nb_subtract*/
13696 0, /*nb_multiply*/
13697 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013698};
13699
Guido van Rossumd57fd912000-03-10 22:53:23 +000013700static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013701 (lenfunc) unicode_length, /* sq_length */
13702 PyUnicode_Concat, /* sq_concat */
13703 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13704 (ssizeargfunc) unicode_getitem, /* sq_item */
13705 0, /* sq_slice */
13706 0, /* sq_ass_item */
13707 0, /* sq_ass_slice */
13708 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013709};
13710
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013711static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013712unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013713{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013714 if (PyUnicode_READY(self) == -1)
13715 return NULL;
13716
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013717 if (PyIndex_Check(item)) {
13718 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013719 if (i == -1 && PyErr_Occurred())
13720 return NULL;
13721 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013722 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013723 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013724 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013725 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013726 PyObject *result;
13727 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013728 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013729 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013731 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013732 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013733 return NULL;
13734 }
13735
13736 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013737 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013738 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013739 slicelength == PyUnicode_GET_LENGTH(self)) {
13740 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013741 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013742 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013743 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013744 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013745 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013746 src_kind = PyUnicode_KIND(self);
13747 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013748 if (!PyUnicode_IS_ASCII(self)) {
13749 kind_limit = kind_maxchar_limit(src_kind);
13750 max_char = 0;
13751 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13752 ch = PyUnicode_READ(src_kind, src_data, cur);
13753 if (ch > max_char) {
13754 max_char = ch;
13755 if (max_char >= kind_limit)
13756 break;
13757 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013758 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013759 }
Victor Stinner55c99112011-10-13 01:17:06 +020013760 else
13761 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013762 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013763 if (result == NULL)
13764 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013765 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013766 dest_data = PyUnicode_DATA(result);
13767
13768 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013769 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13770 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013771 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013772 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013773 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013774 } else {
13775 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13776 return NULL;
13777 }
13778}
13779
13780static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013781 (lenfunc)unicode_length, /* mp_length */
13782 (binaryfunc)unicode_subscript, /* mp_subscript */
13783 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013784};
13785
Guido van Rossumd57fd912000-03-10 22:53:23 +000013786
Guido van Rossumd57fd912000-03-10 22:53:23 +000013787/* Helpers for PyUnicode_Format() */
13788
Victor Stinnera47082312012-10-04 02:19:54 +020013789struct unicode_formatter_t {
13790 PyObject *args;
13791 int args_owned;
13792 Py_ssize_t arglen, argidx;
13793 PyObject *dict;
13794
13795 enum PyUnicode_Kind fmtkind;
13796 Py_ssize_t fmtcnt, fmtpos;
13797 void *fmtdata;
13798 PyObject *fmtstr;
13799
13800 _PyUnicodeWriter writer;
13801};
13802
13803struct unicode_format_arg_t {
13804 Py_UCS4 ch;
13805 int flags;
13806 Py_ssize_t width;
13807 int prec;
13808 int sign;
13809};
13810
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013812unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013813{
Victor Stinnera47082312012-10-04 02:19:54 +020013814 Py_ssize_t argidx = ctx->argidx;
13815
13816 if (argidx < ctx->arglen) {
13817 ctx->argidx++;
13818 if (ctx->arglen < 0)
13819 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013820 else
Victor Stinnera47082312012-10-04 02:19:54 +020013821 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013822 }
13823 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013824 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013825 return NULL;
13826}
13827
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013828/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013829
Victor Stinnera47082312012-10-04 02:19:54 +020013830/* Format a float into the writer if the writer is not NULL, or into *p_output
13831 otherwise.
13832
13833 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013834static int
Victor Stinnera47082312012-10-04 02:19:54 +020013835formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13836 PyObject **p_output,
13837 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013839 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013840 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013842 int prec;
13843 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013844
Guido van Rossumd57fd912000-03-10 22:53:23 +000013845 x = PyFloat_AsDouble(v);
13846 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013847 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013848
Victor Stinnera47082312012-10-04 02:19:54 +020013849 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013850 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013851 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013852
Victor Stinnera47082312012-10-04 02:19:54 +020013853 if (arg->flags & F_ALT)
13854 dtoa_flags = Py_DTSF_ALT;
13855 else
13856 dtoa_flags = 0;
13857 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013858 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013859 return -1;
13860 len = strlen(p);
13861 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013862 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013863 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013864 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013865 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013866 }
13867 else
13868 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013869 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013870 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013871}
13872
Victor Stinnerd0880d52012-04-27 23:40:13 +020013873/* formatlong() emulates the format codes d, u, o, x and X, and
13874 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13875 * Python's regular ints.
13876 * Return value: a new PyUnicodeObject*, or NULL if error.
13877 * The output string is of the form
13878 * "-"? ("0x" | "0X")? digit+
13879 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13880 * set in flags. The case of hex digits will be correct,
13881 * There will be at least prec digits, zero-filled on the left if
13882 * necessary to get that many.
13883 * val object to be converted
13884 * flags bitmask of format flags; only F_ALT is looked at
13885 * prec minimum number of digits; 0-fill on left if needed
13886 * type a character in [duoxX]; u acts the same as d
13887 *
13888 * CAUTION: o, x and X conversions on regular ints can never
13889 * produce a '-' sign, but can for Python's unbounded ints.
13890 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013891static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013892formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013893{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013894 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013895 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013896 Py_ssize_t i;
13897 int sign; /* 1 if '-', else 0 */
13898 int len; /* number of characters */
13899 Py_ssize_t llen;
13900 int numdigits; /* len == numnondigits + numdigits */
13901 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013902 int prec = arg->prec;
13903 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013904
Victor Stinnerd0880d52012-04-27 23:40:13 +020013905 /* Avoid exceeding SSIZE_T_MAX */
13906 if (prec > INT_MAX-3) {
13907 PyErr_SetString(PyExc_OverflowError,
13908 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013909 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013910 }
13911
13912 assert(PyLong_Check(val));
13913
13914 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013915 default:
13916 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013917 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013918 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013919 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013920 /* int and int subclasses should print numerically when a numeric */
13921 /* format code is used (see issue18780) */
13922 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013923 break;
13924 case 'o':
13925 numnondigits = 2;
13926 result = PyNumber_ToBase(val, 8);
13927 break;
13928 case 'x':
13929 case 'X':
13930 numnondigits = 2;
13931 result = PyNumber_ToBase(val, 16);
13932 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013933 }
13934 if (!result)
13935 return NULL;
13936
13937 assert(unicode_modifiable(result));
13938 assert(PyUnicode_IS_READY(result));
13939 assert(PyUnicode_IS_ASCII(result));
13940
13941 /* To modify the string in-place, there can only be one reference. */
13942 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013943 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013944 PyErr_BadInternalCall();
13945 return NULL;
13946 }
13947 buf = PyUnicode_DATA(result);
13948 llen = PyUnicode_GET_LENGTH(result);
13949 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013950 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013951 PyErr_SetString(PyExc_ValueError,
13952 "string too large in _PyBytes_FormatLong");
13953 return NULL;
13954 }
13955 len = (int)llen;
13956 sign = buf[0] == '-';
13957 numnondigits += sign;
13958 numdigits = len - numnondigits;
13959 assert(numdigits > 0);
13960
13961 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013962 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013963 (type == 'o' || type == 'x' || type == 'X'))) {
13964 assert(buf[sign] == '0');
13965 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13966 buf[sign+1] == 'o');
13967 numnondigits -= 2;
13968 buf += 2;
13969 len -= 2;
13970 if (sign)
13971 buf[0] = '-';
13972 assert(len == numnondigits + numdigits);
13973 assert(numdigits > 0);
13974 }
13975
13976 /* Fill with leading zeroes to meet minimum width. */
13977 if (prec > numdigits) {
13978 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13979 numnondigits + prec);
13980 char *b1;
13981 if (!r1) {
13982 Py_DECREF(result);
13983 return NULL;
13984 }
13985 b1 = PyBytes_AS_STRING(r1);
13986 for (i = 0; i < numnondigits; ++i)
13987 *b1++ = *buf++;
13988 for (i = 0; i < prec - numdigits; i++)
13989 *b1++ = '0';
13990 for (i = 0; i < numdigits; i++)
13991 *b1++ = *buf++;
13992 *b1 = '\0';
13993 Py_DECREF(result);
13994 result = r1;
13995 buf = PyBytes_AS_STRING(result);
13996 len = numnondigits + prec;
13997 }
13998
13999 /* Fix up case for hex conversions. */
14000 if (type == 'X') {
14001 /* Need to convert all lower case letters to upper case.
14002 and need to convert 0x to 0X (and -0x to -0X). */
14003 for (i = 0; i < len; i++)
14004 if (buf[i] >= 'a' && buf[i] <= 'x')
14005 buf[i] -= 'a'-'A';
14006 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014007 if (!PyUnicode_Check(result)
14008 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014009 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014010 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014011 Py_DECREF(result);
14012 result = unicode;
14013 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014014 else if (len != PyUnicode_GET_LENGTH(result)) {
14015 if (PyUnicode_Resize(&result, len) < 0)
14016 Py_CLEAR(result);
14017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014018 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014019}
14020
Ethan Furmandf3ed242014-01-05 06:50:30 -080014021/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014022 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014023 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014024 * -1 and raise an exception on error */
14025static int
Victor Stinnera47082312012-10-04 02:19:54 +020014026mainformatlong(PyObject *v,
14027 struct unicode_format_arg_t *arg,
14028 PyObject **p_output,
14029 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014030{
14031 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014032 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014033
14034 if (!PyNumber_Check(v))
14035 goto wrongtype;
14036
Ethan Furman9ab74802014-03-21 06:38:46 -070014037 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014038 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014039 if (type == 'o' || type == 'x' || type == 'X') {
14040 iobj = PyNumber_Index(v);
14041 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014042 if (PyErr_ExceptionMatches(PyExc_TypeError))
14043 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014044 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014045 }
14046 }
14047 else {
14048 iobj = PyNumber_Long(v);
14049 if (iobj == NULL ) {
14050 if (PyErr_ExceptionMatches(PyExc_TypeError))
14051 goto wrongtype;
14052 return -1;
14053 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014054 }
14055 assert(PyLong_Check(iobj));
14056 }
14057 else {
14058 iobj = v;
14059 Py_INCREF(iobj);
14060 }
14061
14062 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014063 && arg->width == -1 && arg->prec == -1
14064 && !(arg->flags & (F_SIGN | F_BLANK))
14065 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014066 {
14067 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014068 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014069 int base;
14070
Victor Stinnera47082312012-10-04 02:19:54 +020014071 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014072 {
14073 default:
14074 assert(0 && "'type' not in [diuoxX]");
14075 case 'd':
14076 case 'i':
14077 case 'u':
14078 base = 10;
14079 break;
14080 case 'o':
14081 base = 8;
14082 break;
14083 case 'x':
14084 case 'X':
14085 base = 16;
14086 break;
14087 }
14088
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014089 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14090 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014091 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014092 }
14093 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014094 return 1;
14095 }
14096
Victor Stinnera47082312012-10-04 02:19:54 +020014097 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014098 Py_DECREF(iobj);
14099 if (res == NULL)
14100 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014101 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014102 return 0;
14103
14104wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014105 switch(type)
14106 {
14107 case 'o':
14108 case 'x':
14109 case 'X':
14110 PyErr_Format(PyExc_TypeError,
14111 "%%%c format: an integer is required, "
14112 "not %.200s",
14113 type, Py_TYPE(v)->tp_name);
14114 break;
14115 default:
14116 PyErr_Format(PyExc_TypeError,
14117 "%%%c format: a number is required, "
14118 "not %.200s",
14119 type, Py_TYPE(v)->tp_name);
14120 break;
14121 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014122 return -1;
14123}
14124
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014125static Py_UCS4
14126formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014127{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014128 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014129 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014130 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014131 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014132 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014133 goto onError;
14134 }
14135 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014136 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014138 /* make sure number is a type of integer */
14139 if (!PyLong_Check(v)) {
14140 iobj = PyNumber_Index(v);
14141 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014142 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014143 }
14144 v = iobj;
14145 Py_DECREF(iobj);
14146 }
14147 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014148 x = PyLong_AsLong(v);
14149 if (x == -1 && PyErr_Occurred())
14150 goto onError;
14151
Victor Stinner8faf8212011-12-08 22:14:11 +010014152 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014153 PyErr_SetString(PyExc_OverflowError,
14154 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014155 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014156 }
14157
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014158 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014160
Benjamin Peterson29060642009-01-31 22:14:21 +000014161 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014162 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014163 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014164 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014165}
14166
Victor Stinnera47082312012-10-04 02:19:54 +020014167/* Parse options of an argument: flags, width, precision.
14168 Handle also "%(name)" syntax.
14169
14170 Return 0 if the argument has been formatted into arg->str.
14171 Return 1 if the argument has been written into ctx->writer,
14172 Raise an exception and return -1 on error. */
14173static int
14174unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14175 struct unicode_format_arg_t *arg)
14176{
14177#define FORMAT_READ(ctx) \
14178 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14179
14180 PyObject *v;
14181
Victor Stinnera47082312012-10-04 02:19:54 +020014182 if (arg->ch == '(') {
14183 /* Get argument value from a dictionary. Example: "%(name)s". */
14184 Py_ssize_t keystart;
14185 Py_ssize_t keylen;
14186 PyObject *key;
14187 int pcount = 1;
14188
14189 if (ctx->dict == NULL) {
14190 PyErr_SetString(PyExc_TypeError,
14191 "format requires a mapping");
14192 return -1;
14193 }
14194 ++ctx->fmtpos;
14195 --ctx->fmtcnt;
14196 keystart = ctx->fmtpos;
14197 /* Skip over balanced parentheses */
14198 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14199 arg->ch = FORMAT_READ(ctx);
14200 if (arg->ch == ')')
14201 --pcount;
14202 else if (arg->ch == '(')
14203 ++pcount;
14204 ctx->fmtpos++;
14205 }
14206 keylen = ctx->fmtpos - keystart - 1;
14207 if (ctx->fmtcnt < 0 || pcount > 0) {
14208 PyErr_SetString(PyExc_ValueError,
14209 "incomplete format key");
14210 return -1;
14211 }
14212 key = PyUnicode_Substring(ctx->fmtstr,
14213 keystart, keystart + keylen);
14214 if (key == NULL)
14215 return -1;
14216 if (ctx->args_owned) {
14217 Py_DECREF(ctx->args);
14218 ctx->args_owned = 0;
14219 }
14220 ctx->args = PyObject_GetItem(ctx->dict, key);
14221 Py_DECREF(key);
14222 if (ctx->args == NULL)
14223 return -1;
14224 ctx->args_owned = 1;
14225 ctx->arglen = -1;
14226 ctx->argidx = -2;
14227 }
14228
14229 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014230 while (--ctx->fmtcnt >= 0) {
14231 arg->ch = FORMAT_READ(ctx);
14232 ctx->fmtpos++;
14233 switch (arg->ch) {
14234 case '-': arg->flags |= F_LJUST; continue;
14235 case '+': arg->flags |= F_SIGN; continue;
14236 case ' ': arg->flags |= F_BLANK; continue;
14237 case '#': arg->flags |= F_ALT; continue;
14238 case '0': arg->flags |= F_ZERO; continue;
14239 }
14240 break;
14241 }
14242
14243 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014244 if (arg->ch == '*') {
14245 v = unicode_format_getnextarg(ctx);
14246 if (v == NULL)
14247 return -1;
14248 if (!PyLong_Check(v)) {
14249 PyErr_SetString(PyExc_TypeError,
14250 "* wants int");
14251 return -1;
14252 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014253 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014254 if (arg->width == -1 && PyErr_Occurred())
14255 return -1;
14256 if (arg->width < 0) {
14257 arg->flags |= F_LJUST;
14258 arg->width = -arg->width;
14259 }
14260 if (--ctx->fmtcnt >= 0) {
14261 arg->ch = FORMAT_READ(ctx);
14262 ctx->fmtpos++;
14263 }
14264 }
14265 else if (arg->ch >= '0' && arg->ch <= '9') {
14266 arg->width = arg->ch - '0';
14267 while (--ctx->fmtcnt >= 0) {
14268 arg->ch = FORMAT_READ(ctx);
14269 ctx->fmtpos++;
14270 if (arg->ch < '0' || arg->ch > '9')
14271 break;
14272 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14273 mixing signed and unsigned comparison. Since arg->ch is between
14274 '0' and '9', casting to int is safe. */
14275 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14276 PyErr_SetString(PyExc_ValueError,
14277 "width too big");
14278 return -1;
14279 }
14280 arg->width = arg->width*10 + (arg->ch - '0');
14281 }
14282 }
14283
14284 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014285 if (arg->ch == '.') {
14286 arg->prec = 0;
14287 if (--ctx->fmtcnt >= 0) {
14288 arg->ch = FORMAT_READ(ctx);
14289 ctx->fmtpos++;
14290 }
14291 if (arg->ch == '*') {
14292 v = unicode_format_getnextarg(ctx);
14293 if (v == NULL)
14294 return -1;
14295 if (!PyLong_Check(v)) {
14296 PyErr_SetString(PyExc_TypeError,
14297 "* wants int");
14298 return -1;
14299 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014300 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014301 if (arg->prec == -1 && PyErr_Occurred())
14302 return -1;
14303 if (arg->prec < 0)
14304 arg->prec = 0;
14305 if (--ctx->fmtcnt >= 0) {
14306 arg->ch = FORMAT_READ(ctx);
14307 ctx->fmtpos++;
14308 }
14309 }
14310 else if (arg->ch >= '0' && arg->ch <= '9') {
14311 arg->prec = arg->ch - '0';
14312 while (--ctx->fmtcnt >= 0) {
14313 arg->ch = FORMAT_READ(ctx);
14314 ctx->fmtpos++;
14315 if (arg->ch < '0' || arg->ch > '9')
14316 break;
14317 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14318 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014319 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014320 return -1;
14321 }
14322 arg->prec = arg->prec*10 + (arg->ch - '0');
14323 }
14324 }
14325 }
14326
14327 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14328 if (ctx->fmtcnt >= 0) {
14329 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14330 if (--ctx->fmtcnt >= 0) {
14331 arg->ch = FORMAT_READ(ctx);
14332 ctx->fmtpos++;
14333 }
14334 }
14335 }
14336 if (ctx->fmtcnt < 0) {
14337 PyErr_SetString(PyExc_ValueError,
14338 "incomplete format");
14339 return -1;
14340 }
14341 return 0;
14342
14343#undef FORMAT_READ
14344}
14345
14346/* Format one argument. Supported conversion specifiers:
14347
14348 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014349 - "i", "d", "u": int or float
14350 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014351 - "e", "E", "f", "F", "g", "G": float
14352 - "c": int or str (1 character)
14353
Victor Stinner8dbd4212012-12-04 09:30:24 +010014354 When possible, the output is written directly into the Unicode writer
14355 (ctx->writer). A string is created when padding is required.
14356
Victor Stinnera47082312012-10-04 02:19:54 +020014357 Return 0 if the argument has been formatted into *p_str,
14358 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014359 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014360static int
14361unicode_format_arg_format(struct unicode_formatter_t *ctx,
14362 struct unicode_format_arg_t *arg,
14363 PyObject **p_str)
14364{
14365 PyObject *v;
14366 _PyUnicodeWriter *writer = &ctx->writer;
14367
14368 if (ctx->fmtcnt == 0)
14369 ctx->writer.overallocate = 0;
14370
14371 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014372 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014373 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014374 return 1;
14375 }
14376
14377 v = unicode_format_getnextarg(ctx);
14378 if (v == NULL)
14379 return -1;
14380
Victor Stinnera47082312012-10-04 02:19:54 +020014381
14382 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014383 case 's':
14384 case 'r':
14385 case 'a':
14386 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14387 /* Fast path */
14388 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14389 return -1;
14390 return 1;
14391 }
14392
14393 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14394 *p_str = v;
14395 Py_INCREF(*p_str);
14396 }
14397 else {
14398 if (arg->ch == 's')
14399 *p_str = PyObject_Str(v);
14400 else if (arg->ch == 'r')
14401 *p_str = PyObject_Repr(v);
14402 else
14403 *p_str = PyObject_ASCII(v);
14404 }
14405 break;
14406
14407 case 'i':
14408 case 'd':
14409 case 'u':
14410 case 'o':
14411 case 'x':
14412 case 'X':
14413 {
14414 int ret = mainformatlong(v, arg, p_str, writer);
14415 if (ret != 0)
14416 return ret;
14417 arg->sign = 1;
14418 break;
14419 }
14420
14421 case 'e':
14422 case 'E':
14423 case 'f':
14424 case 'F':
14425 case 'g':
14426 case 'G':
14427 if (arg->width == -1 && arg->prec == -1
14428 && !(arg->flags & (F_SIGN | F_BLANK)))
14429 {
14430 /* Fast path */
14431 if (formatfloat(v, arg, NULL, writer) == -1)
14432 return -1;
14433 return 1;
14434 }
14435
14436 arg->sign = 1;
14437 if (formatfloat(v, arg, p_str, NULL) == -1)
14438 return -1;
14439 break;
14440
14441 case 'c':
14442 {
14443 Py_UCS4 ch = formatchar(v);
14444 if (ch == (Py_UCS4) -1)
14445 return -1;
14446 if (arg->width == -1 && arg->prec == -1) {
14447 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014448 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014449 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014450 return 1;
14451 }
14452 *p_str = PyUnicode_FromOrdinal(ch);
14453 break;
14454 }
14455
14456 default:
14457 PyErr_Format(PyExc_ValueError,
14458 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014459 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014460 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14461 (int)arg->ch,
14462 ctx->fmtpos - 1);
14463 return -1;
14464 }
14465 if (*p_str == NULL)
14466 return -1;
14467 assert (PyUnicode_Check(*p_str));
14468 return 0;
14469}
14470
14471static int
14472unicode_format_arg_output(struct unicode_formatter_t *ctx,
14473 struct unicode_format_arg_t *arg,
14474 PyObject *str)
14475{
14476 Py_ssize_t len;
14477 enum PyUnicode_Kind kind;
14478 void *pbuf;
14479 Py_ssize_t pindex;
14480 Py_UCS4 signchar;
14481 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014482 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014483 Py_ssize_t sublen;
14484 _PyUnicodeWriter *writer = &ctx->writer;
14485 Py_UCS4 fill;
14486
14487 fill = ' ';
14488 if (arg->sign && arg->flags & F_ZERO)
14489 fill = '0';
14490
14491 if (PyUnicode_READY(str) == -1)
14492 return -1;
14493
14494 len = PyUnicode_GET_LENGTH(str);
14495 if ((arg->width == -1 || arg->width <= len)
14496 && (arg->prec == -1 || arg->prec >= len)
14497 && !(arg->flags & (F_SIGN | F_BLANK)))
14498 {
14499 /* Fast path */
14500 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14501 return -1;
14502 return 0;
14503 }
14504
14505 /* Truncate the string for "s", "r" and "a" formats
14506 if the precision is set */
14507 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14508 if (arg->prec >= 0 && len > arg->prec)
14509 len = arg->prec;
14510 }
14511
14512 /* Adjust sign and width */
14513 kind = PyUnicode_KIND(str);
14514 pbuf = PyUnicode_DATA(str);
14515 pindex = 0;
14516 signchar = '\0';
14517 if (arg->sign) {
14518 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14519 if (ch == '-' || ch == '+') {
14520 signchar = ch;
14521 len--;
14522 pindex++;
14523 }
14524 else if (arg->flags & F_SIGN)
14525 signchar = '+';
14526 else if (arg->flags & F_BLANK)
14527 signchar = ' ';
14528 else
14529 arg->sign = 0;
14530 }
14531 if (arg->width < len)
14532 arg->width = len;
14533
14534 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014535 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014536 if (!(arg->flags & F_LJUST)) {
14537 if (arg->sign) {
14538 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014539 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014540 }
14541 else {
14542 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014543 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014544 }
14545 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014546 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14547 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014548 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014549 }
14550
Victor Stinnera47082312012-10-04 02:19:54 +020014551 buflen = arg->width;
14552 if (arg->sign && len == arg->width)
14553 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014554 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014555 return -1;
14556
14557 /* Write the sign if needed */
14558 if (arg->sign) {
14559 if (fill != ' ') {
14560 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14561 writer->pos += 1;
14562 }
14563 if (arg->width > len)
14564 arg->width--;
14565 }
14566
14567 /* Write the numeric prefix for "x", "X" and "o" formats
14568 if the alternate form is used.
14569 For example, write "0x" for the "%#x" format. */
14570 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14571 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14572 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14573 if (fill != ' ') {
14574 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14575 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14576 writer->pos += 2;
14577 pindex += 2;
14578 }
14579 arg->width -= 2;
14580 if (arg->width < 0)
14581 arg->width = 0;
14582 len -= 2;
14583 }
14584
14585 /* Pad left with the fill character if needed */
14586 if (arg->width > len && !(arg->flags & F_LJUST)) {
14587 sublen = arg->width - len;
14588 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14589 writer->pos += sublen;
14590 arg->width = len;
14591 }
14592
14593 /* If padding with spaces: write sign if needed and/or numeric prefix if
14594 the alternate form is used */
14595 if (fill == ' ') {
14596 if (arg->sign) {
14597 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14598 writer->pos += 1;
14599 }
14600 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14601 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14602 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14603 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14604 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14605 writer->pos += 2;
14606 pindex += 2;
14607 }
14608 }
14609
14610 /* Write characters */
14611 if (len) {
14612 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14613 str, pindex, len);
14614 writer->pos += len;
14615 }
14616
14617 /* Pad right with the fill character if needed */
14618 if (arg->width > len) {
14619 sublen = arg->width - len;
14620 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14621 writer->pos += sublen;
14622 }
14623 return 0;
14624}
14625
14626/* Helper of PyUnicode_Format(): format one arg.
14627 Return 0 on success, raise an exception and return -1 on error. */
14628static int
14629unicode_format_arg(struct unicode_formatter_t *ctx)
14630{
14631 struct unicode_format_arg_t arg;
14632 PyObject *str;
14633 int ret;
14634
Victor Stinner8dbd4212012-12-04 09:30:24 +010014635 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14636 arg.flags = 0;
14637 arg.width = -1;
14638 arg.prec = -1;
14639 arg.sign = 0;
14640 str = NULL;
14641
Victor Stinnera47082312012-10-04 02:19:54 +020014642 ret = unicode_format_arg_parse(ctx, &arg);
14643 if (ret == -1)
14644 return -1;
14645
14646 ret = unicode_format_arg_format(ctx, &arg, &str);
14647 if (ret == -1)
14648 return -1;
14649
14650 if (ret != 1) {
14651 ret = unicode_format_arg_output(ctx, &arg, str);
14652 Py_DECREF(str);
14653 if (ret == -1)
14654 return -1;
14655 }
14656
14657 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14658 PyErr_SetString(PyExc_TypeError,
14659 "not all arguments converted during string formatting");
14660 return -1;
14661 }
14662 return 0;
14663}
14664
Alexander Belopolsky40018472011-02-26 01:02:56 +000014665PyObject *
14666PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014667{
Victor Stinnera47082312012-10-04 02:19:54 +020014668 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014669
Guido van Rossumd57fd912000-03-10 22:53:23 +000014670 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014671 PyErr_BadInternalCall();
14672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014673 }
Victor Stinnera47082312012-10-04 02:19:54 +020014674
14675 ctx.fmtstr = PyUnicode_FromObject(format);
14676 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014677 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014678 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14679 Py_DECREF(ctx.fmtstr);
14680 return NULL;
14681 }
14682 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14683 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14684 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14685 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014686
Victor Stinner8f674cc2013-04-17 23:02:17 +020014687 _PyUnicodeWriter_Init(&ctx.writer);
14688 ctx.writer.min_length = ctx.fmtcnt + 100;
14689 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014690
Guido van Rossumd57fd912000-03-10 22:53:23 +000014691 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014692 ctx.arglen = PyTuple_Size(args);
14693 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014694 }
14695 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014696 ctx.arglen = -1;
14697 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014698 }
Victor Stinnera47082312012-10-04 02:19:54 +020014699 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014700 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014701 ctx.dict = args;
14702 else
14703 ctx.dict = NULL;
14704 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014705
Victor Stinnera47082312012-10-04 02:19:54 +020014706 while (--ctx.fmtcnt >= 0) {
14707 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014708 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014709
14710 nonfmtpos = ctx.fmtpos++;
14711 while (ctx.fmtcnt >= 0 &&
14712 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14713 ctx.fmtpos++;
14714 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014715 }
Victor Stinnera47082312012-10-04 02:19:54 +020014716 if (ctx.fmtcnt < 0) {
14717 ctx.fmtpos--;
14718 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014719 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014720
Victor Stinnercfc4c132013-04-03 01:48:39 +020014721 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14722 nonfmtpos, ctx.fmtpos) < 0)
14723 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014724 }
14725 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014726 ctx.fmtpos++;
14727 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014728 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014729 }
14730 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014731
Victor Stinnera47082312012-10-04 02:19:54 +020014732 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014733 PyErr_SetString(PyExc_TypeError,
14734 "not all arguments converted during string formatting");
14735 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014736 }
14737
Victor Stinnera47082312012-10-04 02:19:54 +020014738 if (ctx.args_owned) {
14739 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014740 }
Victor Stinnera47082312012-10-04 02:19:54 +020014741 Py_DECREF(ctx.fmtstr);
14742 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014743
Benjamin Peterson29060642009-01-31 22:14:21 +000014744 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014745 Py_DECREF(ctx.fmtstr);
14746 _PyUnicodeWriter_Dealloc(&ctx.writer);
14747 if (ctx.args_owned) {
14748 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014749 }
14750 return NULL;
14751}
14752
Jeremy Hylton938ace62002-07-17 16:30:39 +000014753static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014754unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14755
Tim Peters6d6c1a32001-08-02 04:15:00 +000014756static PyObject *
14757unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14758{
Benjamin Peterson29060642009-01-31 22:14:21 +000014759 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014760 static char *kwlist[] = {"object", "encoding", "errors", 0};
14761 char *encoding = NULL;
14762 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014763
Benjamin Peterson14339b62009-01-31 16:36:08 +000014764 if (type != &PyUnicode_Type)
14765 return unicode_subtype_new(type, args, kwds);
14766 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014767 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014768 return NULL;
14769 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014770 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014771 if (encoding == NULL && errors == NULL)
14772 return PyObject_Str(x);
14773 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014774 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014775}
14776
Guido van Rossume023fe02001-08-30 03:12:59 +000014777static PyObject *
14778unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14779{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014780 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014781 Py_ssize_t length, char_size;
14782 int share_wstr, share_utf8;
14783 unsigned int kind;
14784 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014785
Benjamin Peterson14339b62009-01-31 16:36:08 +000014786 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014787
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014788 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014789 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014790 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014791 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014792 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014793 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014794 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014795 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014796
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014797 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014798 if (self == NULL) {
14799 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014800 return NULL;
14801 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014802 kind = PyUnicode_KIND(unicode);
14803 length = PyUnicode_GET_LENGTH(unicode);
14804
14805 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014806#ifdef Py_DEBUG
14807 _PyUnicode_HASH(self) = -1;
14808#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014809 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014810#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014811 _PyUnicode_STATE(self).interned = 0;
14812 _PyUnicode_STATE(self).kind = kind;
14813 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014814 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014815 _PyUnicode_STATE(self).ready = 1;
14816 _PyUnicode_WSTR(self) = NULL;
14817 _PyUnicode_UTF8_LENGTH(self) = 0;
14818 _PyUnicode_UTF8(self) = NULL;
14819 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014820 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014821
14822 share_utf8 = 0;
14823 share_wstr = 0;
14824 if (kind == PyUnicode_1BYTE_KIND) {
14825 char_size = 1;
14826 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14827 share_utf8 = 1;
14828 }
14829 else if (kind == PyUnicode_2BYTE_KIND) {
14830 char_size = 2;
14831 if (sizeof(wchar_t) == 2)
14832 share_wstr = 1;
14833 }
14834 else {
14835 assert(kind == PyUnicode_4BYTE_KIND);
14836 char_size = 4;
14837 if (sizeof(wchar_t) == 4)
14838 share_wstr = 1;
14839 }
14840
14841 /* Ensure we won't overflow the length. */
14842 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14843 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014844 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014845 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014846 data = PyObject_MALLOC((length + 1) * char_size);
14847 if (data == NULL) {
14848 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014849 goto onError;
14850 }
14851
Victor Stinnerc3c74152011-10-02 20:39:55 +020014852 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014853 if (share_utf8) {
14854 _PyUnicode_UTF8_LENGTH(self) = length;
14855 _PyUnicode_UTF8(self) = data;
14856 }
14857 if (share_wstr) {
14858 _PyUnicode_WSTR_LENGTH(self) = length;
14859 _PyUnicode_WSTR(self) = (wchar_t *)data;
14860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014861
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014862 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014863 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014864 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014865#ifdef Py_DEBUG
14866 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14867#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014868 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014869 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014870
14871onError:
14872 Py_DECREF(unicode);
14873 Py_DECREF(self);
14874 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014875}
14876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014877PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014878"str(object='') -> str\n\
14879str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014880\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014881Create a new string object from the given object. If encoding or\n\
14882errors is specified, then the object must expose a data buffer\n\
14883that will be decoded using the given encoding and error handler.\n\
14884Otherwise, returns the result of object.__str__() (if defined)\n\
14885or repr(object).\n\
14886encoding defaults to sys.getdefaultencoding().\n\
14887errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014888
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014889static PyObject *unicode_iter(PyObject *seq);
14890
Guido van Rossumd57fd912000-03-10 22:53:23 +000014891PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014892 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014893 "str", /* tp_name */
14894 sizeof(PyUnicodeObject), /* tp_size */
14895 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014896 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014897 (destructor)unicode_dealloc, /* tp_dealloc */
14898 0, /* tp_print */
14899 0, /* tp_getattr */
14900 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014901 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014902 unicode_repr, /* tp_repr */
14903 &unicode_as_number, /* tp_as_number */
14904 &unicode_as_sequence, /* tp_as_sequence */
14905 &unicode_as_mapping, /* tp_as_mapping */
14906 (hashfunc) unicode_hash, /* tp_hash*/
14907 0, /* tp_call*/
14908 (reprfunc) unicode_str, /* tp_str */
14909 PyObject_GenericGetAttr, /* tp_getattro */
14910 0, /* tp_setattro */
14911 0, /* tp_as_buffer */
14912 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014913 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014914 unicode_doc, /* tp_doc */
14915 0, /* tp_traverse */
14916 0, /* tp_clear */
14917 PyUnicode_RichCompare, /* tp_richcompare */
14918 0, /* tp_weaklistoffset */
14919 unicode_iter, /* tp_iter */
14920 0, /* tp_iternext */
14921 unicode_methods, /* tp_methods */
14922 0, /* tp_members */
14923 0, /* tp_getset */
14924 &PyBaseObject_Type, /* tp_base */
14925 0, /* tp_dict */
14926 0, /* tp_descr_get */
14927 0, /* tp_descr_set */
14928 0, /* tp_dictoffset */
14929 0, /* tp_init */
14930 0, /* tp_alloc */
14931 unicode_new, /* tp_new */
14932 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014933};
14934
14935/* Initialize the Unicode implementation */
14936
Victor Stinner3a50e702011-10-18 21:21:00 +020014937int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014938{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014939 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014940 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014941 0x000A, /* LINE FEED */
14942 0x000D, /* CARRIAGE RETURN */
14943 0x001C, /* FILE SEPARATOR */
14944 0x001D, /* GROUP SEPARATOR */
14945 0x001E, /* RECORD SEPARATOR */
14946 0x0085, /* NEXT LINE */
14947 0x2028, /* LINE SEPARATOR */
14948 0x2029, /* PARAGRAPH SEPARATOR */
14949 };
14950
Fred Drakee4315f52000-05-09 19:53:39 +000014951 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014952 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014953 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014954 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014955 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014956
Guido van Rossumcacfc072002-05-24 19:01:59 +000014957 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014958 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014959
14960 /* initialize the linebreak bloom filter */
14961 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014962 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014963 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014964
Christian Heimes26532f72013-07-20 14:57:16 +020014965 if (PyType_Ready(&EncodingMapType) < 0)
14966 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014967
Benjamin Petersonc4311282012-10-30 23:21:10 -040014968 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14969 Py_FatalError("Can't initialize field name iterator type");
14970
14971 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14972 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014973
Victor Stinner3a50e702011-10-18 21:21:00 +020014974#ifdef HAVE_MBCS
14975 winver.dwOSVersionInfoSize = sizeof(winver);
14976 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14977 PyErr_SetFromWindowsErr(0);
14978 return -1;
14979 }
14980#endif
14981 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014982}
14983
14984/* Finalize the Unicode implementation */
14985
/* This implementation has nothing to release: it does no work and
   always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14991
Guido van Rossumd57fd912000-03-10 22:53:23 +000014992void
Thomas Wouters78890102000-07-22 19:25:51 +000014993_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014994{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014995 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014996
Serhiy Storchaka05997252013-01-26 12:14:02 +020014997 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014998
Serhiy Storchaka05997252013-01-26 12:14:02 +020014999 for (i = 0; i < 256; i++)
15000 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015001 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015002 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015003}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015004
Walter Dörwald16807132007-05-25 13:52:07 +000015005void
15006PyUnicode_InternInPlace(PyObject **p)
15007{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015008 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015009 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015010#ifdef Py_DEBUG
15011 assert(s != NULL);
15012 assert(_PyUnicode_CHECK(s));
15013#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015014 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015015 return;
15016#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015017 /* If it's a subclass, we don't really know what putting
15018 it in the interned dict might do. */
15019 if (!PyUnicode_CheckExact(s))
15020 return;
15021 if (PyUnicode_CHECK_INTERNED(s))
15022 return;
15023 if (interned == NULL) {
15024 interned = PyDict_New();
15025 if (interned == NULL) {
15026 PyErr_Clear(); /* Don't leave an exception */
15027 return;
15028 }
15029 }
15030 /* It might be that the GetItem call fails even
15031 though the key is present in the dictionary,
15032 namely when this happens during a stack overflow. */
15033 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015034 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015035 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015036
Victor Stinnerf0335102013-04-14 19:13:03 +020015037 if (t) {
15038 Py_INCREF(t);
15039 Py_DECREF(*p);
15040 *p = t;
15041 return;
15042 }
Walter Dörwald16807132007-05-25 13:52:07 +000015043
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015045 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 PyErr_Clear();
15047 PyThreadState_GET()->recursion_critical = 0;
15048 return;
15049 }
15050 PyThreadState_GET()->recursion_critical = 0;
15051 /* The two references in interned are not counted by refcnt.
15052 The deallocator will take care of this */
15053 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015054 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015055}
15056
15057void
15058PyUnicode_InternImmortal(PyObject **p)
15059{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 PyUnicode_InternInPlace(p);
15061 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015062 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015063 Py_INCREF(*p);
15064 }
Walter Dörwald16807132007-05-25 13:52:07 +000015065}
15066
15067PyObject *
15068PyUnicode_InternFromString(const char *cp)
15069{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015070 PyObject *s = PyUnicode_FromString(cp);
15071 if (s == NULL)
15072 return NULL;
15073 PyUnicode_InternInPlace(&s);
15074 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015075}
15076
Alexander Belopolsky40018472011-02-26 01:02:56 +000015077void
15078_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015079{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015081 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015082 Py_ssize_t i, n;
15083 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015084
Benjamin Peterson14339b62009-01-31 16:36:08 +000015085 if (interned == NULL || !PyDict_Check(interned))
15086 return;
15087 keys = PyDict_Keys(interned);
15088 if (keys == NULL || !PyList_Check(keys)) {
15089 PyErr_Clear();
15090 return;
15091 }
Walter Dörwald16807132007-05-25 13:52:07 +000015092
Benjamin Peterson14339b62009-01-31 16:36:08 +000015093 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15094 detector, interned unicode strings are not forcibly deallocated;
15095 rather, we give them their stolen references back, and then clear
15096 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015097
Benjamin Peterson14339b62009-01-31 16:36:08 +000015098 n = PyList_GET_SIZE(keys);
15099 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015100 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015101 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015102 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015103 if (PyUnicode_READY(s) == -1) {
15104 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015105 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015107 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015108 case SSTATE_NOT_INTERNED:
15109 /* XXX Shouldn't happen */
15110 break;
15111 case SSTATE_INTERNED_IMMORTAL:
15112 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015113 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015114 break;
15115 case SSTATE_INTERNED_MORTAL:
15116 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015117 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015118 break;
15119 default:
15120 Py_FatalError("Inconsistent interned string state.");
15121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015122 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015123 }
15124 fprintf(stderr, "total size of all interned strings: "
15125 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15126 "mortal/immortal\n", mortal_size, immortal_size);
15127 Py_DECREF(keys);
15128 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015129 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015130}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015131
15132
15133/********************* Unicode Iterator **************************/
15134
15135typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015136 PyObject_HEAD
15137 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015138 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015139} unicodeiterobject;
15140
15141static void
15142unicodeiter_dealloc(unicodeiterobject *it)
15143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 _PyObject_GC_UNTRACK(it);
15145 Py_XDECREF(it->it_seq);
15146 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015147}
15148
15149static int
15150unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015152 Py_VISIT(it->it_seq);
15153 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015154}
15155
15156static PyObject *
15157unicodeiter_next(unicodeiterobject *it)
15158{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015159 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015160
Benjamin Peterson14339b62009-01-31 16:36:08 +000015161 assert(it != NULL);
15162 seq = it->it_seq;
15163 if (seq == NULL)
15164 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015165 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015167 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15168 int kind = PyUnicode_KIND(seq);
15169 void *data = PyUnicode_DATA(seq);
15170 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15171 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 if (item != NULL)
15173 ++it->it_index;
15174 return item;
15175 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015176
Benjamin Peterson14339b62009-01-31 16:36:08 +000015177 Py_DECREF(seq);
15178 it->it_seq = NULL;
15179 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015180}
15181
15182static PyObject *
15183unicodeiter_len(unicodeiterobject *it)
15184{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015185 Py_ssize_t len = 0;
15186 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015187 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015188 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015189}
15190
15191PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15192
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015193static PyObject *
15194unicodeiter_reduce(unicodeiterobject *it)
15195{
15196 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015197 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015198 it->it_seq, it->it_index);
15199 } else {
15200 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15201 if (u == NULL)
15202 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015203 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015204 }
15205}
15206
15207PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15208
15209static PyObject *
15210unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15211{
15212 Py_ssize_t index = PyLong_AsSsize_t(state);
15213 if (index == -1 && PyErr_Occurred())
15214 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015215 if (it->it_seq != NULL) {
15216 if (index < 0)
15217 index = 0;
15218 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15219 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15220 it->it_index = index;
15221 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015222 Py_RETURN_NONE;
15223}
15224
15225PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15226
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015227static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015228 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015229 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015230 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15231 reduce_doc},
15232 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15233 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015234 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015235};
15236
15237PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015238 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15239 "str_iterator", /* tp_name */
15240 sizeof(unicodeiterobject), /* tp_basicsize */
15241 0, /* tp_itemsize */
15242 /* methods */
15243 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15244 0, /* tp_print */
15245 0, /* tp_getattr */
15246 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015247 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 0, /* tp_repr */
15249 0, /* tp_as_number */
15250 0, /* tp_as_sequence */
15251 0, /* tp_as_mapping */
15252 0, /* tp_hash */
15253 0, /* tp_call */
15254 0, /* tp_str */
15255 PyObject_GenericGetAttr, /* tp_getattro */
15256 0, /* tp_setattro */
15257 0, /* tp_as_buffer */
15258 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15259 0, /* tp_doc */
15260 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15261 0, /* tp_clear */
15262 0, /* tp_richcompare */
15263 0, /* tp_weaklistoffset */
15264 PyObject_SelfIter, /* tp_iter */
15265 (iternextfunc)unicodeiter_next, /* tp_iternext */
15266 unicodeiter_methods, /* tp_methods */
15267 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015268};
15269
15270static PyObject *
15271unicode_iter(PyObject *seq)
15272{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015274
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 if (!PyUnicode_Check(seq)) {
15276 PyErr_BadInternalCall();
15277 return NULL;
15278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015279 if (PyUnicode_READY(seq) == -1)
15280 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15282 if (it == NULL)
15283 return NULL;
15284 it->it_index = 0;
15285 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015286 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015287 _PyObject_GC_TRACK(it);
15288 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015289}
15290
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015291
15292size_t
15293Py_UNICODE_strlen(const Py_UNICODE *u)
15294{
15295 int res = 0;
15296 while(*u++)
15297 res++;
15298 return res;
15299}
15300
15301Py_UNICODE*
15302Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15303{
15304 Py_UNICODE *u = s1;
15305 while ((*u++ = *s2++));
15306 return s1;
15307}
15308
15309Py_UNICODE*
15310Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15311{
15312 Py_UNICODE *u = s1;
15313 while ((*u++ = *s2++))
15314 if (n-- == 0)
15315 break;
15316 return s1;
15317}
15318
15319Py_UNICODE*
15320Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15321{
15322 Py_UNICODE *u1 = s1;
15323 u1 += Py_UNICODE_strlen(u1);
15324 Py_UNICODE_strcpy(u1, s2);
15325 return s1;
15326}
15327
15328int
15329Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15330{
15331 while (*s1 && *s2 && *s1 == *s2)
15332 s1++, s2++;
15333 if (*s1 && *s2)
15334 return (*s1 < *s2) ? -1 : +1;
15335 if (*s1)
15336 return 1;
15337 if (*s2)
15338 return -1;
15339 return 0;
15340}
15341
15342int
15343Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15344{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015345 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015346 for (; n != 0; n--) {
15347 u1 = *s1;
15348 u2 = *s2;
15349 if (u1 != u2)
15350 return (u1 < u2) ? -1 : +1;
15351 if (u1 == '\0')
15352 return 0;
15353 s1++;
15354 s2++;
15355 }
15356 return 0;
15357}
15358
15359Py_UNICODE*
15360Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15361{
15362 const Py_UNICODE *p;
15363 for (p = s; *p; p++)
15364 if (*p == c)
15365 return (Py_UNICODE*)p;
15366 return NULL;
15367}
15368
15369Py_UNICODE*
15370Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15371{
15372 const Py_UNICODE *p;
15373 p = s + Py_UNICODE_strlen(s);
15374 while (p != s) {
15375 p--;
15376 if (*p == c)
15377 return (Py_UNICODE*)p;
15378 }
15379 return NULL;
15380}
Victor Stinner331ea922010-08-10 16:37:20 +000015381
Victor Stinner71133ff2010-09-01 23:43:53 +000015382Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015383PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015384{
Victor Stinner577db2c2011-10-11 22:12:48 +020015385 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015386 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015388 if (!PyUnicode_Check(unicode)) {
15389 PyErr_BadArgument();
15390 return NULL;
15391 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015392 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015393 if (u == NULL)
15394 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015395 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015396 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015397 PyErr_NoMemory();
15398 return NULL;
15399 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015400 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015401 size *= sizeof(Py_UNICODE);
15402 copy = PyMem_Malloc(size);
15403 if (copy == NULL) {
15404 PyErr_NoMemory();
15405 return NULL;
15406 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015407 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015408 return copy;
15409}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015410
Georg Brandl66c221e2010-10-14 07:04:07 +000015411/* A _string module, to export formatter_parser and formatter_field_name_split
15412 to the string.Formatter class implemented in Python. */
15413
15414static PyMethodDef _string_methods[] = {
15415 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15416 METH_O, PyDoc_STR("split the argument as a field name")},
15417 {"formatter_parser", (PyCFunction) formatter_parser,
15418 METH_O, PyDoc_STR("parse the argument as a format string")},
15419 {NULL, NULL}
15420};
15421
15422static struct PyModuleDef _string_module = {
15423 PyModuleDef_HEAD_INIT,
15424 "_string",
15425 PyDoc_STR("string helper module"),
15426 0,
15427 _string_methods,
15428 NULL,
15429 NULL,
15430 NULL,
15431 NULL
15432};
15433
15434PyMODINIT_FUNC
15435PyInit__string(void)
15436{
15437 return PyModule_Create(&_string_module);
15438}
15439
15440
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015441#ifdef __cplusplus
15442}
15443#endif