blob: 067a945b0552caee832427a0039c6dd72c9ac49d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
Victor Stinner8faf8212011-12-08 22:14:11 +010068/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
69#define MAX_UNICODE 0x10ffff
70
/* Sanity check used by the checked accessors in this file: a structural
   consistency check (check_content == 0) in debug builds, a plain type
   check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif

/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject *)(op))->utf8)
/* Checked UTF-8 pointer of a ready string: the memory right after the
   PyASCIIObject header for compact ASCII strings, the utf8 field
   otherwise. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char *)((PyASCIIObject *)(op) + 1)) :        \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject *)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: the length field for compact
   ASCII strings, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject *)(op))->length :              \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors (no assertion on the object). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject *)(op))->data.any)

/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True when the utf8 pointer is the string's own character data, i.e. no
   separate UTF-8 buffer is attached.  Must not be applied to compact ASCII
   strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True when the wstr pointer is the string's own character data.
   The expansion refers only to the macro parameter `op`, so the result
   does not depend on any variable that happens to be named `unicode` at
   the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* Non-zero when the object has its own UTF-8 buffer: it is not a compact
   ASCII string, the utf8 pointer is set, and that pointer is not the
   string's character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object has its own wstr buffer: the wstr pointer is
   set and either the string is not ready or wstr is not the string's
   character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
140
/* Copy a run of characters while converting the element type.

   from_type / to_type -- valid type names of the source and target elements.
   begin, end          -- bounds of the source run (treated as "from_type *").
   to                  -- destination buffer (treated as "to_type *").

   Each element is converted with a plain cast.  The bulk of the run is
   copied four elements per iteration; the remainder one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (from_type *)(begin);                   \
        const from_type *_src_end = (from_type *)(end);                 \
        Py_ssize_t _count = _src_end - _src;                            \
        const from_type *_blocks_end =                                  \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < _blocks_end; _src += 4, _dst += 4) {              \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < _src_end; ++_src, ++_dst)                         \
            *_dst = (to_type) *_src;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string, creating the object
   with PyUnicode_New(0, 0) the first time it is needed.  If that creation
   fails, unicode_empty stays NULL and no reference is taken, so users of
   this macro must check unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return the shared empty string from the enclosing function.  The value
   returned is whatever unicode_empty holds after _Py_INCREF_UNICODE_EMPTY(),
   which is NULL if the object could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Lookup table for the most frequent whitespace characters: one entry per
   ASCII code (128 entries), 1 for whitespace and 0 otherwise.

   Entries set to 1:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN        0x1C FILE SEPARATOR
     0x1D GROUP SEPARATOR        0x1E RECORD SEPARATOR
     0x1F UNIT SEPARATOR         0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Lookup table for line-break characters: one entry per ASCII code
   (128 entries), 1 for a line break and 0 otherwise.  The table is only
   ever read, so it is declared const.

   Entries set to 1:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a str object.

   Every invariant is enforced with assert(); the function itself always
   returns 1 so that it can be wrapped as
   assert(_PyUnicode_CheckConsistency(...)).

   op            -- object to inspect; must satisfy PyUnicode_Check().
   check_content -- when non-zero, and the string is not of the wchar kind,
                    also scan the characters to verify that the narrowest
                    fitting kind is used and that the buffer ends with a
                    null character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact string that has not been made ready yet */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* non-compact, ready string */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (!ascii->state.ascii)
                assert (compact->utf8 != data);
            else {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
        }

        /* wstr may alias the data only when the widths match */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must have a zero length recorded */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    else {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be null */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
Victor Stinner3a50e702011-10-18 21:21:00 +0200522#ifdef HAVE_MBCS
523static OSVERSIONINFOEX winver;
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used: the low-order bits of
   each character (ch & (BLOOM_WIDTH - 1)) select the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
/* Non-zero when the bit selected by the low-order bits of `ch` is set in
   `mask`.  Both arguments are parenthesized in the expansion so that an
   expression such as `a | b` can be passed as the mask. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True when `ch` is a line break: table lookup for characters below 128,
   otherwise the bloom mask is consulted first and Py_UNICODE_ISLINEBREAK()
   is only called when the mask bit is set.
   Note: `ch` is evaluated more than once, so it must have no side
   effects. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Debug-build helper that poisons freshly exposed character storage.

   Every byte of the characters in the range [old_length, current length)
   is set to 0xff so that code which forgets to initialize them is caught
   early: _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at
   least for ASCII and UCS-4 strings (U+00FF is invalid in ASCII and
   U+FFFFFFFF is not a valid Unicode 6.0 character).

   Nothing is written when the current length is not greater than
   old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t cur_length = _PyUnicode_LENGTH(unicode);

    if (cur_length > old_length) {
        /* The kind value doubles as the per-character size in bytes. */
        memset(bytes + old_length * char_width, 0xff,
               (cur_length - old_length) * char_width);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000870 Ux0000 terminated; some code (e.g. new_identifier)
871 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872
873 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000874 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875
876*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001014 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1015
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 if (ascii->wstr == data)
1017 printf("shared ");
1018 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001019
Victor Stinnera3b334d2011-10-03 13:53:37 +02001020 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(" (%zu), ", compact->wstr_length);
1022 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1023 printf("shared ");
1024 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the 4-byte data buffer
   of `unicode`.  A high surrogate immediately followed by a low surrogate
   is combined into one code point; every other unit, including a lone
   surrogate, is copied unchanged.  (The remaining width conversions are
   implemented as macros for efficiency.)

   The destination must be a 4-byte-kind string whose length equals the
   number of code points produced; this function assumes that unicode can
   hold one more code point than that for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Well-formed pair: two input units, one output code point. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst++;
            src += 2;
        }
        else {
            *dst = *src;
            dst++;
            src++;
        }
    }
    /* The destination must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
1373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
1375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
/* Convert a not-yet-ready string, which only has a wchar_t (wstr) buffer,
   into its canonical representation.

   The wstr buffer is scanned with find_maxchar_surrogates(); depending on
   the largest code point found, the string becomes a 1-byte, 2-byte or
   4-byte string.  Where the width of wchar_t matches the chosen kind the
   wstr buffer is shared as the data buffer; otherwise a new buffer is
   allocated, filled, and the wstr buffer is freed.

   Return 0 on success.  Return -1 if find_maxchar_surrogates() fails or if
   an allocation fails (PyErr_NoMemory() is called in the latter case). */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* Determine the widest character and the number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Latin-1 range: narrow every wchar_t into a freshly allocated
       byte buffer (one extra byte for the terminating NUL). */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the UTF-8 pointer is set to the data buffer
               itself, so no separate UTF-8 copy is needed. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wchar_t buffer has been converted and is released. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        /* NOTE(review): the message names the scanner
           "FindMaxCharAndNumSurrogatePairs"; the function called above is
           find_maxchar_surrogates(). */
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Allocate two bytes per character plus the terminator, narrow
           the data into it, then release the wchar_t buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair occupies two wchar_t units but yields one
           character, hence the subtraction. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is not 2 bytes wide here: the wstr buffer is used
           directly as the 4-byte data buffer and is kept. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    /* Mark the string ready on every successful path. */
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1566
Alexander Belopolsky40018472011-02-26 01:02:56 +00001567static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001568unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569{
Walter Dörwald16807132007-05-25 13:52:07 +00001570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 case SSTATE_NOT_INTERNED:
1572 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 case SSTATE_INTERNED_MORTAL:
1575 /* revive dead object temporarily for DelItem */
1576 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001577 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 Py_FatalError(
1579 "deletion of interned string failed");
1580 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_INTERNED_IMMORTAL:
1583 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 default:
1586 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001587 }
1588
Victor Stinner03490912011-10-03 23:45:12 +02001589 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001591 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001592 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001593 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1594 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001596 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597}
1598
#ifdef Py_DEBUG
/* Debug-only helper.  Return 1 if `unicode` is the shared empty string or
   the cached one-character string stored in unicode_latin1[] for its
   character; return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only a string of length 1 that is not of wchar kind can be one of
       the latin-1 singletons. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first >= 256)
        return 0;
    return (unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1615
Alexander Belopolsky40018472011-02-26 01:02:56 +00001616static int
Victor Stinner488fa492011-12-12 00:01:39 +01001617unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618{
Victor Stinner488fa492011-12-12 00:01:39 +01001619 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (Py_REFCNT(unicode) != 1)
1621 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001622 if (_PyUnicode_HASH(unicode) != -1)
1623 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (PyUnicode_CHECK_INTERNED(unicode))
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (!PyUnicode_CheckExact(unicode))
1627 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001628#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001629 /* singleton refcount is greater than 1 */
1630 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001631#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001632 return 1;
1633}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635static int
1636unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1637{
1638 PyObject *unicode;
1639 Py_ssize_t old_length;
1640
1641 assert(p_unicode != NULL);
1642 unicode = *p_unicode;
1643
1644 assert(unicode != NULL);
1645 assert(PyUnicode_Check(unicode));
1646 assert(0 <= length);
1647
Victor Stinner910337b2011-10-03 03:20:16 +02001648 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001649 old_length = PyUnicode_WSTR_LENGTH(unicode);
1650 else
1651 old_length = PyUnicode_GET_LENGTH(unicode);
1652 if (old_length == length)
1653 return 0;
1654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001656 _Py_INCREF_UNICODE_EMPTY();
1657 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001658 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 Py_DECREF(*p_unicode);
1660 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 return 0;
1662 }
1663
Victor Stinner488fa492011-12-12 00:01:39 +01001664 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 PyObject *copy = resize_copy(unicode, length);
1666 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001667 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 Py_DECREF(*p_unicode);
1669 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
1672
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001674 PyObject *new_unicode = resize_compact(unicode, length);
1675 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001677 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001679 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001680 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001681}
1682
Alexander Belopolsky40018472011-02-26 01:02:56 +00001683int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001684PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001685{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686 PyObject *unicode;
1687 if (p_unicode == NULL) {
1688 PyErr_BadInternalCall();
1689 return -1;
1690 }
1691 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693 {
1694 PyErr_BadInternalCall();
1695 return -1;
1696 }
1697 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001698}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001699
Victor Stinnerc5166102012-02-22 13:55:02 +01001700/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001701
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001702 WARNING: The function doesn't copy the terminating null character and
1703 doesn't check the maximum character (may write a latin1 character in an
1704 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001705static void
1706unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1707 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001708{
1709 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1710 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001711 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001712
1713 switch (kind) {
1714 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001715 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001716#ifdef Py_DEBUG
1717 if (PyUnicode_IS_ASCII(unicode)) {
1718 Py_UCS4 maxchar = ucs1lib_find_max_char(
1719 (const Py_UCS1*)str,
1720 (const Py_UCS1*)str + len);
1721 assert(maxchar < 128);
1722 }
1723#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001724 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001725 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001726 }
1727 case PyUnicode_2BYTE_KIND: {
1728 Py_UCS2 *start = (Py_UCS2 *)data + index;
1729 Py_UCS2 *ucs2 = start;
1730 assert(index <= PyUnicode_GET_LENGTH(unicode));
1731
Victor Stinner184252a2012-06-16 02:57:41 +02001732 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 *ucs2 = (Py_UCS2)*str;
1734
1735 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001736 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 }
1738 default: {
1739 Py_UCS4 *start = (Py_UCS4 *)data + index;
1740 Py_UCS4 *ucs4 = start;
1741 assert(kind == PyUnicode_4BYTE_KIND);
1742 assert(index <= PyUnicode_GET_LENGTH(unicode));
1743
Victor Stinner184252a2012-06-16 02:57:41 +02001744 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001745 *ucs4 = (Py_UCS4)*str;
1746
1747 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001748 }
1749 }
1750}
1751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752static PyObject*
1753get_latin1_char(unsigned char ch)
1754{
Victor Stinnera464fc12011-10-02 20:39:30 +02001755 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode)
1759 return NULL;
1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 unicode_latin1[ch] = unicode;
1763 }
1764 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001765 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Victor Stinner985a82a2014-01-03 12:53:47 +01001768static PyObject*
1769unicode_char(Py_UCS4 ch)
1770{
1771 PyObject *unicode;
1772
1773 assert(ch <= MAX_UNICODE);
1774
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001775 if (ch < 256)
1776 return get_latin1_char(ch);
1777
Victor Stinner985a82a2014-01-03 12:53:47 +01001778 unicode = PyUnicode_New(1, ch);
1779 if (unicode == NULL)
1780 return NULL;
1781 switch (PyUnicode_KIND(unicode)) {
1782 case PyUnicode_1BYTE_KIND:
1783 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1784 break;
1785 case PyUnicode_2BYTE_KIND:
1786 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1787 break;
1788 default:
1789 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1790 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1791 }
1792 assert(_PyUnicode_CheckConsistency(unicode, 1));
1793 return unicode;
1794}
1795
Alexander Belopolsky40018472011-02-26 01:02:56 +00001796PyObject *
1797PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001799 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 Py_UCS4 maxchar = 0;
1801 Py_ssize_t num_surrogates;
1802
1803 if (u == NULL)
1804 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001806 /* If the Unicode data is known at construction time, we can apply
1807 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001810 if (size == 0)
1811 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Single character Unicode objects in the Latin-1 range are
1814 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001815 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 return get_latin1_char((unsigned char)*u);
1817
1818 /* If not empty and not single character, copy the Unicode data
1819 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 if (find_maxchar_surrogates(u, u + size,
1821 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 return NULL;
1823
Victor Stinner8faf8212011-12-08 22:14:11 +01001824 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 if (!unicode)
1826 return NULL;
1827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 switch (PyUnicode_KIND(unicode)) {
1829 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001830 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1832 break;
1833 case PyUnicode_2BYTE_KIND:
1834#if Py_UNICODE_SIZE == 2
1835 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1836#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001837 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1839#endif
1840 break;
1841 case PyUnicode_4BYTE_KIND:
1842#if SIZEOF_WCHAR_T == 2
1843 /* This is the only case which has to process surrogates, thus
1844 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001845 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846#else
1847 assert(num_surrogates == 0);
1848 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1849#endif
1850 break;
1851 default:
1852 assert(0 && "Impossible state");
1853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001855 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856}
1857
Alexander Belopolsky40018472011-02-26 01:02:56 +00001858PyObject *
1859PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001860{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001861 if (size < 0) {
1862 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001864 return NULL;
1865 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001866 if (u != NULL)
1867 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1868 else
1869 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001870}
1871
Alexander Belopolsky40018472011-02-26 01:02:56 +00001872PyObject *
1873PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001874{
1875 size_t size = strlen(u);
1876 if (size > PY_SSIZE_T_MAX) {
1877 PyErr_SetString(PyExc_OverflowError, "input too long");
1878 return NULL;
1879 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001880 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001881}
1882
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001883PyObject *
1884_PyUnicode_FromId(_Py_Identifier *id)
1885{
1886 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001887 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1888 strlen(id->string),
1889 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001890 if (!id->object)
1891 return NULL;
1892 PyUnicode_InternInPlace(&id->object);
1893 assert(!id->next);
1894 id->next = static_strings;
1895 static_strings = id;
1896 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001897 return id->object;
1898}
1899
1900void
1901_PyUnicode_ClearStaticStrings()
1902{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001903 _Py_Identifier *tmp, *s = static_strings;
1904 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001905 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001906 tmp = s->next;
1907 s->next = NULL;
1908 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001909 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001911}
1912
Benjamin Peterson0df54292012-03-26 14:50:32 -04001913/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914
Victor Stinnerd3f08822012-05-29 12:57:52 +02001915PyObject*
1916_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001917{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001918 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001919 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001920 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001921#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001922 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001923#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001924 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001925 }
Victor Stinner785938e2011-12-11 20:09:03 +01001926 unicode = PyUnicode_New(size, 127);
1927 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001928 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001929 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1930 assert(_PyUnicode_CheckConsistency(unicode, 1));
1931 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001932}
1933
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001934static Py_UCS4
1935kind_maxchar_limit(unsigned int kind)
1936{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001937 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001938 case PyUnicode_1BYTE_KIND:
1939 return 0x80;
1940 case PyUnicode_2BYTE_KIND:
1941 return 0x100;
1942 case PyUnicode_4BYTE_KIND:
1943 return 0x10000;
1944 default:
1945 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001946 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001947 }
1948}
1949
Victor Stinnere6abb482012-05-02 01:15:40 +02001950Py_LOCAL_INLINE(Py_UCS4)
1951align_maxchar(Py_UCS4 maxchar)
1952{
1953 if (maxchar <= 127)
1954 return 127;
1955 else if (maxchar <= 255)
1956 return 255;
1957 else if (maxchar <= 65535)
1958 return 65535;
1959 else
1960 return MAX_UNICODE;
1961}
1962
Victor Stinner702c7342011-10-05 13:50:52 +02001963static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001964_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001967 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001968
Serhiy Storchaka678db842013-01-26 12:16:36 +02001969 if (size == 0)
1970 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001972 if (size == 1)
1973 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001975 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001976 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 if (!res)
1978 return NULL;
1979 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001982}
1983
Victor Stinnere57b1c02011-09-28 22:20:48 +02001984static PyObject*
1985_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986{
1987 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001988 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989
Serhiy Storchaka678db842013-01-26 12:16:36 +02001990 if (size == 0)
1991 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001992 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001993 if (size == 1)
1994 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001995
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001996 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001997 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 if (!res)
1999 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002000 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002002 else {
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2005 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002006 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return res;
2008}
2009
Victor Stinnere57b1c02011-09-28 22:20:48 +02002010static PyObject*
2011_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012{
2013 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002014 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002015
Serhiy Storchaka678db842013-01-26 12:16:36 +02002016 if (size == 0)
2017 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002019 if (size == 1)
2020 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002021
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002022 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002023 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 if (!res)
2025 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002026 if (max_char < 256)
2027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2028 PyUnicode_1BYTE_DATA(res));
2029 else if (max_char < 0x10000)
2030 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2031 PyUnicode_2BYTE_DATA(res));
2032 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002034 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return res;
2036}
2037
2038PyObject*
2039PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2040{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002041 if (size < 0) {
2042 PyErr_SetString(PyExc_ValueError, "size must be positive");
2043 return NULL;
2044 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002045 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002047 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002052 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002053 PyErr_SetString(PyExc_SystemError, "invalid kind");
2054 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056}
2057
Victor Stinnerece58de2012-04-23 23:36:38 +02002058Py_UCS4
2059_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2060{
2061 enum PyUnicode_Kind kind;
2062 void *startptr, *endptr;
2063
2064 assert(PyUnicode_IS_READY(unicode));
2065 assert(0 <= start);
2066 assert(end <= PyUnicode_GET_LENGTH(unicode));
2067 assert(start <= end);
2068
2069 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2070 return PyUnicode_MAX_CHAR_VALUE(unicode);
2071
2072 if (start == end)
2073 return 127;
2074
Victor Stinner94d558b2012-04-27 22:26:58 +02002075 if (PyUnicode_IS_ASCII(unicode))
2076 return 127;
2077
Victor Stinnerece58de2012-04-23 23:36:38 +02002078 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002079 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002080 endptr = (char *)startptr + end * kind;
2081 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002082 switch(kind) {
2083 case PyUnicode_1BYTE_KIND:
2084 return ucs1lib_find_max_char(startptr, endptr);
2085 case PyUnicode_2BYTE_KIND:
2086 return ucs2lib_find_max_char(startptr, endptr);
2087 case PyUnicode_4BYTE_KIND:
2088 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002090 assert(0);
2091 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002092 }
2093}
2094
Victor Stinner25a4b292011-10-06 12:31:55 +02002095/* Ensure that a string uses the most efficient storage, if it is not the
2096 case: create a new string with of the right kind. Write NULL into *p_unicode
2097 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002098static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002099unicode_adjust_maxchar(PyObject **p_unicode)
2100{
2101 PyObject *unicode, *copy;
2102 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002103 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002104 unsigned int kind;
2105
2106 assert(p_unicode != NULL);
2107 unicode = *p_unicode;
2108 assert(PyUnicode_IS_READY(unicode));
2109 if (PyUnicode_IS_ASCII(unicode))
2110 return;
2111
2112 len = PyUnicode_GET_LENGTH(unicode);
2113 kind = PyUnicode_KIND(unicode);
2114 if (kind == PyUnicode_1BYTE_KIND) {
2115 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002116 max_char = ucs1lib_find_max_char(u, u + len);
2117 if (max_char >= 128)
2118 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 }
2120 else if (kind == PyUnicode_2BYTE_KIND) {
2121 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002122 max_char = ucs2lib_find_max_char(u, u + len);
2123 if (max_char >= 256)
2124 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002125 }
2126 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002127 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002128 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 max_char = ucs4lib_find_max_char(u, u + len);
2130 if (max_char >= 0x10000)
2131 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002134 if (copy != NULL)
2135 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 Py_DECREF(unicode);
2137 *p_unicode = copy;
2138}
2139
Victor Stinner034f6cf2011-09-30 02:26:44 +02002140PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002141_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142{
Victor Stinner87af4f22011-11-21 23:03:47 +01002143 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002144 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145
Victor Stinner034f6cf2011-09-30 02:26:44 +02002146 if (!PyUnicode_Check(unicode)) {
2147 PyErr_BadInternalCall();
2148 return NULL;
2149 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002150 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002151 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152
Victor Stinner87af4f22011-11-21 23:03:47 +01002153 length = PyUnicode_GET_LENGTH(unicode);
2154 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002155 if (!copy)
2156 return NULL;
2157 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2158
Victor Stinner87af4f22011-11-21 23:03:47 +01002159 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2160 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002161 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002162 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002163}
2164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165
Victor Stinnerbc603d12011-10-02 01:00:40 +02002166/* Widen Unicode objects to larger buffers. Don't write terminating null
2167 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168
2169void*
2170_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2171{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 Py_ssize_t len;
2173 void *result;
2174 unsigned int skind;
2175
Benjamin Petersonbac79492012-01-14 13:34:47 -05002176 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002177 return NULL;
2178
2179 len = PyUnicode_GET_LENGTH(s);
2180 skind = PyUnicode_KIND(s);
2181 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002182 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 return NULL;
2184 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002185 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 case PyUnicode_2BYTE_KIND:
2187 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2188 if (!result)
2189 return PyErr_NoMemory();
2190 assert(skind == PyUnicode_1BYTE_KIND);
2191 _PyUnicode_CONVERT_BYTES(
2192 Py_UCS1, Py_UCS2,
2193 PyUnicode_1BYTE_DATA(s),
2194 PyUnicode_1BYTE_DATA(s) + len,
2195 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002197 case PyUnicode_4BYTE_KIND:
2198 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2199 if (!result)
2200 return PyErr_NoMemory();
2201 if (skind == PyUnicode_2BYTE_KIND) {
2202 _PyUnicode_CONVERT_BYTES(
2203 Py_UCS2, Py_UCS4,
2204 PyUnicode_2BYTE_DATA(s),
2205 PyUnicode_2BYTE_DATA(s) + len,
2206 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002208 else {
2209 assert(skind == PyUnicode_1BYTE_KIND);
2210 _PyUnicode_CONVERT_BYTES(
2211 Py_UCS1, Py_UCS4,
2212 PyUnicode_1BYTE_DATA(s),
2213 PyUnicode_1BYTE_DATA(s) + len,
2214 result);
2215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002217 default:
2218 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 }
Victor Stinner01698042011-10-04 00:04:26 +02002220 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 return NULL;
2222}
2223
2224static Py_UCS4*
2225as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2226 int copy_null)
2227{
2228 int kind;
2229 void *data;
2230 Py_ssize_t len, targetlen;
2231 if (PyUnicode_READY(string) == -1)
2232 return NULL;
2233 kind = PyUnicode_KIND(string);
2234 data = PyUnicode_DATA(string);
2235 len = PyUnicode_GET_LENGTH(string);
2236 targetlen = len;
2237 if (copy_null)
2238 targetlen++;
2239 if (!target) {
2240 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2241 PyErr_NoMemory();
2242 return NULL;
2243 }
2244 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2245 if (!target) {
2246 PyErr_NoMemory();
2247 return NULL;
2248 }
2249 }
2250 else {
2251 if (targetsize < targetlen) {
2252 PyErr_Format(PyExc_SystemError,
2253 "string is longer than the buffer");
2254 if (copy_null && 0 < targetsize)
2255 target[0] = 0;
2256 return NULL;
2257 }
2258 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002259 if (kind == PyUnicode_1BYTE_KIND) {
2260 Py_UCS1 *start = (Py_UCS1 *) data;
2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002263 else if (kind == PyUnicode_2BYTE_KIND) {
2264 Py_UCS2 *start = (Py_UCS2 *) data;
2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266 }
2267 else {
2268 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 if (copy_null)
2272 target[len] = 0;
2273 return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278 int copy_null)
2279{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002280 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 PyErr_BadInternalCall();
2282 return NULL;
2283 }
2284 return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
2287Py_UCS4*
2288PyUnicode_AsUCS4Copy(PyObject *string)
2289{
2290 return as_ucs4(string, NULL, 0, 1);
2291}
2292
2293#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002294
Alexander Belopolsky40018472011-02-26 01:02:56 +00002295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002300 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 PyErr_BadInternalCall();
2302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 }
2304
Martin v. Löwis790465f2008-04-05 20:41:37 +00002305 if (size == -1) {
2306 size = wcslen(w);
2307 }
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002313
Walter Dörwald346737f2007-05-31 10:44:43 +00002314static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002315makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002316 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002317{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002318 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 if (longflag)
2320 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002321 else if (longlongflag) {
2322 /* longlongflag should only ever be nonzero on machines with
2323 HAVE_LONG_LONG defined */
2324#ifdef HAVE_LONG_LONG
2325 char *f = PY_FORMAT_LONG_LONG;
2326 while (*f)
2327 *fmt++ = *f++;
2328#else
2329 /* we shouldn't ever get here */
2330 assert(0);
2331 *fmt++ = 'l';
2332#endif
2333 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002334 else if (size_tflag) {
2335 char *f = PY_FORMAT_SIZE_T;
2336 while (*f)
2337 *fmt++ = *f++;
2338 }
2339 *fmt++ = c;
2340 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002341}
2342
Victor Stinner15a11362012-10-06 23:48:20 +02002343/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002344 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2345 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2346#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002347
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002348static int
2349unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2350 Py_ssize_t width, Py_ssize_t precision)
2351{
2352 Py_ssize_t length, fill, arglen;
2353 Py_UCS4 maxchar;
2354
2355 if (PyUnicode_READY(str) == -1)
2356 return -1;
2357
2358 length = PyUnicode_GET_LENGTH(str);
2359 if ((precision == -1 || precision >= length)
2360 && width <= length)
2361 return _PyUnicodeWriter_WriteStr(writer, str);
2362
2363 if (precision != -1)
2364 length = Py_MIN(precision, length);
2365
2366 arglen = Py_MAX(length, width);
2367 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2368 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2369 else
2370 maxchar = writer->maxchar;
2371
2372 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2373 return -1;
2374
2375 if (width > length) {
2376 fill = width - length;
2377 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2378 return -1;
2379 writer->pos += fill;
2380 }
2381
2382 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2383 str, 0, length);
2384 writer->pos += length;
2385 return 0;
2386}
2387
2388static int
2389unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2390 Py_ssize_t width, Py_ssize_t precision)
2391{
2392 /* UTF-8 */
2393 Py_ssize_t length;
2394 PyObject *unicode;
2395 int res;
2396
2397 length = strlen(str);
2398 if (precision != -1)
2399 length = Py_MIN(length, precision);
2400 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2401 if (unicode == NULL)
2402 return -1;
2403
2404 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2405 Py_DECREF(unicode);
2406 return res;
2407}
2408
Victor Stinner96865452011-03-01 23:44:09 +00002409static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002410unicode_fromformat_arg(_PyUnicodeWriter *writer,
2411 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002412{
Victor Stinnere215d962012-10-06 23:03:36 +02002413 const char *p;
2414 Py_ssize_t len;
2415 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002416 Py_ssize_t width;
2417 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002418 int longflag;
2419 int longlongflag;
2420 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002421 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002422
2423 p = f;
2424 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002425 zeropad = 0;
2426 if (*f == '0') {
2427 zeropad = 1;
2428 f++;
2429 }
Victor Stinner96865452011-03-01 23:44:09 +00002430
2431 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002432 width = -1;
2433 if (Py_ISDIGIT((unsigned)*f)) {
2434 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002435 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002436 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002437 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002438 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002440 return NULL;
2441 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002442 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002443 f++;
2444 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002445 }
2446 precision = -1;
2447 if (*f == '.') {
2448 f++;
2449 if (Py_ISDIGIT((unsigned)*f)) {
2450 precision = (*f - '0');
2451 f++;
2452 while (Py_ISDIGIT((unsigned)*f)) {
2453 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2454 PyErr_SetString(PyExc_ValueError,
2455 "precision too big");
2456 return NULL;
2457 }
2458 precision = (precision * 10) + (*f - '0');
2459 f++;
2460 }
2461 }
Victor Stinner96865452011-03-01 23:44:09 +00002462 if (*f == '%') {
2463 /* "%.3%s" => f points to "3" */
2464 f--;
2465 }
2466 }
2467 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002468 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002469 f--;
2470 }
Victor Stinner96865452011-03-01 23:44:09 +00002471
2472 /* Handle %ld, %lu, %lld and %llu. */
2473 longflag = 0;
2474 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002475 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002476 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002477 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002478 longflag = 1;
2479 ++f;
2480 }
2481#ifdef HAVE_LONG_LONG
2482 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002483 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002484 longlongflag = 1;
2485 f += 2;
2486 }
2487#endif
2488 }
2489 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002490 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002491 size_tflag = 1;
2492 ++f;
2493 }
Victor Stinnere215d962012-10-06 23:03:36 +02002494
2495 if (f[1] == '\0')
2496 writer->overallocate = 0;
2497
2498 switch (*f) {
2499 case 'c':
2500 {
2501 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002502 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002503 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002504 "character argument not in range(0x110000)");
2505 return NULL;
2506 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002507 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002508 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002509 break;
2510 }
2511
2512 case 'i':
2513 case 'd':
2514 case 'u':
2515 case 'x':
2516 {
2517 /* used by sprintf */
2518 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002519 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002520 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002521
2522 if (*f == 'u') {
2523 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2524
2525 if (longflag)
2526 len = sprintf(buffer, fmt,
2527 va_arg(*vargs, unsigned long));
2528#ifdef HAVE_LONG_LONG
2529 else if (longlongflag)
2530 len = sprintf(buffer, fmt,
2531 va_arg(*vargs, unsigned PY_LONG_LONG));
2532#endif
2533 else if (size_tflag)
2534 len = sprintf(buffer, fmt,
2535 va_arg(*vargs, size_t));
2536 else
2537 len = sprintf(buffer, fmt,
2538 va_arg(*vargs, unsigned int));
2539 }
2540 else if (*f == 'x') {
2541 makefmt(fmt, 0, 0, 0, 'x');
2542 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2543 }
2544 else {
2545 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2546
2547 if (longflag)
2548 len = sprintf(buffer, fmt,
2549 va_arg(*vargs, long));
2550#ifdef HAVE_LONG_LONG
2551 else if (longlongflag)
2552 len = sprintf(buffer, fmt,
2553 va_arg(*vargs, PY_LONG_LONG));
2554#endif
2555 else if (size_tflag)
2556 len = sprintf(buffer, fmt,
2557 va_arg(*vargs, Py_ssize_t));
2558 else
2559 len = sprintf(buffer, fmt,
2560 va_arg(*vargs, int));
2561 }
2562 assert(len >= 0);
2563
Victor Stinnere215d962012-10-06 23:03:36 +02002564 if (precision < len)
2565 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002566
2567 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002568 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2569 return NULL;
2570
Victor Stinnere215d962012-10-06 23:03:36 +02002571 if (width > precision) {
2572 Py_UCS4 fillchar;
2573 fill = width - precision;
2574 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002575 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2576 return NULL;
2577 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002578 }
Victor Stinner15a11362012-10-06 23:48:20 +02002579 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002580 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002581 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2582 return NULL;
2583 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002584 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585
Victor Stinner4a587072013-11-19 12:54:53 +01002586 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2587 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002588 break;
2589 }
2590
2591 case 'p':
2592 {
2593 char number[MAX_LONG_LONG_CHARS];
2594
2595 len = sprintf(number, "%p", va_arg(*vargs, void*));
2596 assert(len >= 0);
2597
2598 /* %p is ill-defined: ensure leading 0x. */
2599 if (number[1] == 'X')
2600 number[1] = 'x';
2601 else if (number[1] != 'x') {
2602 memmove(number + 2, number,
2603 strlen(number) + 1);
2604 number[0] = '0';
2605 number[1] = 'x';
2606 len += 2;
2607 }
2608
Victor Stinner4a587072013-11-19 12:54:53 +01002609 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
2611 break;
2612 }
2613
2614 case 's':
2615 {
2616 /* UTF-8 */
2617 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002618 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002619 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002620 break;
2621 }
2622
2623 case 'U':
2624 {
2625 PyObject *obj = va_arg(*vargs, PyObject *);
2626 assert(obj && _PyUnicode_CHECK(obj));
2627
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002629 return NULL;
2630 break;
2631 }
2632
2633 case 'V':
2634 {
2635 PyObject *obj = va_arg(*vargs, PyObject *);
2636 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002637 if (obj) {
2638 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002639 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002640 return NULL;
2641 }
2642 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002643 assert(str != NULL);
2644 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002645 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002646 }
2647 break;
2648 }
2649
2650 case 'S':
2651 {
2652 PyObject *obj = va_arg(*vargs, PyObject *);
2653 PyObject *str;
2654 assert(obj);
2655 str = PyObject_Str(obj);
2656 if (!str)
2657 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002658 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002659 Py_DECREF(str);
2660 return NULL;
2661 }
2662 Py_DECREF(str);
2663 break;
2664 }
2665
2666 case 'R':
2667 {
2668 PyObject *obj = va_arg(*vargs, PyObject *);
2669 PyObject *repr;
2670 assert(obj);
2671 repr = PyObject_Repr(obj);
2672 if (!repr)
2673 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002674 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002675 Py_DECREF(repr);
2676 return NULL;
2677 }
2678 Py_DECREF(repr);
2679 break;
2680 }
2681
2682 case 'A':
2683 {
2684 PyObject *obj = va_arg(*vargs, PyObject *);
2685 PyObject *ascii;
2686 assert(obj);
2687 ascii = PyObject_ASCII(obj);
2688 if (!ascii)
2689 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002690 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002691 Py_DECREF(ascii);
2692 return NULL;
2693 }
2694 Py_DECREF(ascii);
2695 break;
2696 }
2697
2698 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002699 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002700 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002701 break;
2702
2703 default:
2704 /* if we stumble upon an unknown formatting code, copy the rest
2705 of the format string to the output string. (we cannot just
2706 skip the code, since there's no way to know what's in the
2707 argument list) */
2708 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002709 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002710 return NULL;
2711 f = p+len;
2712 return f;
2713 }
2714
2715 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002716 return f;
2717}
2718
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719PyObject *
2720PyUnicode_FromFormatV(const char *format, va_list vargs)
2721{
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_list vargs2;
2723 const char *f;
2724 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002725
Victor Stinner8f674cc2013-04-17 23:02:17 +02002726 _PyUnicodeWriter_Init(&writer);
2727 writer.min_length = strlen(format) + 100;
2728 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002729
2730 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2731 Copy it to be able to pass a reference to a subfunction. */
2732 Py_VA_COPY(vargs2, vargs);
2733
2734 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002736 f = unicode_fromformat_arg(&writer, f, &vargs2);
2737 if (f == NULL)
2738 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002741 const char *p;
2742 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002743
Victor Stinnere215d962012-10-06 23:03:36 +02002744 p = f;
2745 do
2746 {
2747 if ((unsigned char)*p > 127) {
2748 PyErr_Format(PyExc_ValueError,
2749 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2750 "string, got a non-ASCII byte: 0x%02x",
2751 (unsigned char)*p);
2752 return NULL;
2753 }
2754 p++;
2755 }
2756 while (*p != '\0' && *p != '%');
2757 len = p - f;
2758
2759 if (*p == '\0')
2760 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002761
2762 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002763 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002764
2765 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002767 }
Victor Stinnere215d962012-10-06 23:03:36 +02002768 return _PyUnicodeWriter_Finish(&writer);
2769
2770 fail:
2771 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002773}
2774
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775PyObject *
2776PyUnicode_FromFormat(const char *format, ...)
2777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 PyObject* ret;
2779 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002780
2781#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002783#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 ret = PyUnicode_FromFormatV(format, vargs);
2787 va_end(vargs);
2788 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002789}
2790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791#ifdef HAVE_WCHAR_H
2792
Victor Stinner5593d8a2010-10-02 11:11:27 +00002793/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2794 convert a Unicode object to a wide character string.
2795
Victor Stinnerd88d9832011-09-06 02:00:05 +02002796 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002797 character) required to convert the unicode object. Ignore size argument.
2798
Victor Stinnerd88d9832011-09-06 02:00:05 +02002799 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002800 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002803unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002804 wchar_t *w,
2805 Py_ssize_t size)
2806{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002807 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 const wchar_t *wstr;
2809
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002810 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (wstr == NULL)
2812 return -1;
2813
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002815 if (size > res)
2816 size = res + 1;
2817 else
2818 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002820 return res;
2821 }
2822 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002824}
2825
2826Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002827PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002828 wchar_t *w,
2829 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830{
2831 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 PyErr_BadInternalCall();
2833 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002835 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836}
2837
Victor Stinner137c34c2010-09-29 10:25:54 +00002838wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002839PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002840 Py_ssize_t *size)
2841{
2842 wchar_t* buffer;
2843 Py_ssize_t buflen;
2844
2845 if (unicode == NULL) {
2846 PyErr_BadInternalCall();
2847 return NULL;
2848 }
2849
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002850 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002851 if (buflen == -1)
2852 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002853 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002854 PyErr_NoMemory();
2855 return NULL;
2856 }
2857
Victor Stinner137c34c2010-09-29 10:25:54 +00002858 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2859 if (buffer == NULL) {
2860 PyErr_NoMemory();
2861 return NULL;
2862 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002863 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002864 if (buflen == -1) {
2865 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002866 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002867 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002868 if (size != NULL)
2869 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002870 return buffer;
2871}
2872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002873#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874
Alexander Belopolsky40018472011-02-26 01:02:56 +00002875PyObject *
2876PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002877{
Victor Stinner8faf8212011-12-08 22:14:11 +01002878 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002879 PyErr_SetString(PyExc_ValueError,
2880 "chr() arg not in range(0x110000)");
2881 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002882 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002883
Victor Stinner985a82a2014-01-03 12:53:47 +01002884 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002885}
2886
Alexander Belopolsky40018472011-02-26 01:02:56 +00002887PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002888PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002890 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002893 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002894 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002895 Py_INCREF(obj);
2896 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897 }
2898 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 /* For a Unicode subtype that's not a Unicode object,
2900 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002901 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002902 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002903 PyErr_Format(PyExc_TypeError,
2904 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002905 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002906 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002907}
2908
Alexander Belopolsky40018472011-02-26 01:02:56 +00002909PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002910PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002911 const char *encoding,
2912 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002913{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002914 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002915 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002916
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 PyErr_BadInternalCall();
2919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002921
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002922 /* Decoding bytes objects is the most common case and should be fast */
2923 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002924 if (PyBytes_GET_SIZE(obj) == 0)
2925 _Py_RETURN_UNICODE_EMPTY();
2926 v = PyUnicode_Decode(
2927 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2928 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002929 return v;
2930 }
2931
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002932 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 PyErr_SetString(PyExc_TypeError,
2934 "decoding str is not supported");
2935 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2939 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2940 PyErr_Format(PyExc_TypeError,
2941 "coercing to str: need bytes, bytearray "
2942 "or buffer-like object, %.80s found",
2943 Py_TYPE(obj)->tp_name);
2944 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002945 }
Tim Petersced69f82003-09-16 20:30:58 +00002946
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002947 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002948 PyBuffer_Release(&buffer);
2949 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002951
Serhiy Storchaka05997252013-01-26 12:14:02 +02002952 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002953 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002954 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955}
2956
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the null-terminated result; lower_len is the size of that
   buffer in bytes.  Return 0 on error (the result, terminator included,
   does not fit into lower_len bytes), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator.  Bail out before
       computing &lower[lower_len - 1], whose index would wrap around and
       let the loop below write outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2996
Alexander Belopolsky40018472011-02-26 01:02:56 +00002997PyObject *
2998PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002999 Py_ssize_t size,
3000 const char *encoding,
3001 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003002{
3003 PyObject *buffer = NULL, *unicode;
3004 Py_buffer info;
3005 char lower[11]; /* Enough for any encoding shortcut */
3006
Fred Drakee4315f52000-05-09 19:53:39 +00003007 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003008 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003009 if ((strcmp(lower, "utf-8") == 0) ||
3010 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003011 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003012 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003013 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003014 (strcmp(lower, "iso-8859-1") == 0) ||
3015 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003016 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003017#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003018 else if (strcmp(lower, "mbcs") == 0)
3019 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003020#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003021 else if (strcmp(lower, "ascii") == 0)
3022 return PyUnicode_DecodeASCII(s, size, errors);
3023 else if (strcmp(lower, "utf-16") == 0)
3024 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3025 else if (strcmp(lower, "utf-32") == 0)
3026 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003030 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003031 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003032 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003033 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 if (buffer == NULL)
3035 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003036 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 if (unicode == NULL)
3038 goto onError;
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003041 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3042 "use codecs.decode() to decode to arbitrary types",
3043 encoding,
3044 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 Py_DECREF(unicode);
3046 goto onError;
3047 }
3048 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003049 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003050
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 Py_XDECREF(buffer);
3053 return NULL;
3054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
3057PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003060{
3061 PyObject *v;
3062
3063 if (!PyUnicode_Check(unicode)) {
3064 PyErr_BadArgument();
3065 goto onError;
3066 }
3067
3068 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003070
3071 /* Decode via the codec registry */
3072 v = PyCodec_Decode(unicode, encoding, errors);
3073 if (v == NULL)
3074 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003075 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003078 return NULL;
3079}
3080
Alexander Belopolsky40018472011-02-26 01:02:56 +00003081PyObject *
3082PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003083 const char *encoding,
3084 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003085{
3086 PyObject *v;
3087
3088 if (!PyUnicode_Check(unicode)) {
3089 PyErr_BadArgument();
3090 goto onError;
3091 }
3092
3093 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003095
3096 /* Decode via the codec registry */
3097 v = PyCodec_Decode(unicode, encoding, errors);
3098 if (v == NULL)
3099 goto onError;
3100 if (!PyUnicode_Check(v)) {
3101 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003102 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3103 "use codecs.decode() to decode to arbitrary types",
3104 encoding,
3105 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106 Py_DECREF(v);
3107 goto onError;
3108 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003109 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003112 return NULL;
3113}
3114
Alexander Belopolsky40018472011-02-26 01:02:56 +00003115PyObject *
3116PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003117 Py_ssize_t size,
3118 const char *encoding,
3119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
3121 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003122
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 unicode = PyUnicode_FromUnicode(s, size);
3124 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3127 Py_DECREF(unicode);
3128 return v;
3129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
3132PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003133 const char *encoding,
3134 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003135{
3136 PyObject *v;
3137
3138 if (!PyUnicode_Check(unicode)) {
3139 PyErr_BadArgument();
3140 goto onError;
3141 }
3142
3143 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003145
3146 /* Encode via the codec registry */
3147 v = PyCodec_Encode(unicode, encoding, errors);
3148 if (v == NULL)
3149 goto onError;
3150 return v;
3151
Benjamin Peterson29060642009-01-31 22:14:21 +00003152 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003153 return NULL;
3154}
3155
/* Find the first element of wstr that wcstombs() cannot convert by feeding
   it one character at a time (a surrogate pair counts as one character when
   wchar_t is 2 bytes wide).  Return the index of the offending element, or
   0 when every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator of the probe string. */
#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *current = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3202
Victor Stinner1b579672011-12-17 05:47:23 +01003203static int
3204locale_error_handler(const char *errors, int *surrogateescape)
3205{
3206 if (errors == NULL) {
3207 *surrogateescape = 0;
3208 return 0;
3209 }
3210
3211 if (strcmp(errors, "strict") == 0) {
3212 *surrogateescape = 0;
3213 return 0;
3214 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003215 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003216 *surrogateescape = 1;
3217 return 0;
3218 }
3219 PyErr_Format(PyExc_ValueError,
3220 "only 'strict' and 'surrogateescape' error handlers "
3221 "are supported, not '%s'",
3222 errors);
3223 return -1;
3224}
3225
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003226PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003227PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003228{
3229 Py_ssize_t wlen, wlen2;
3230 wchar_t *wstr;
3231 PyObject *bytes = NULL;
3232 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003233 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003234 PyObject *exc;
3235 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003236 int surrogateescape;
3237
3238 if (locale_error_handler(errors, &surrogateescape) < 0)
3239 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003240
3241 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3242 if (wstr == NULL)
3243 return NULL;
3244
3245 wlen2 = wcslen(wstr);
3246 if (wlen2 != wlen) {
3247 PyMem_Free(wstr);
3248 PyErr_SetString(PyExc_TypeError, "embedded null character");
3249 return NULL;
3250 }
3251
3252 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003253 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003254 char *str;
3255
3256 str = _Py_wchar2char(wstr, &error_pos);
3257 if (str == NULL) {
3258 if (error_pos == (size_t)-1) {
3259 PyErr_NoMemory();
3260 PyMem_Free(wstr);
3261 return NULL;
3262 }
3263 else {
3264 goto encode_error;
3265 }
3266 }
3267 PyMem_Free(wstr);
3268
3269 bytes = PyBytes_FromString(str);
3270 PyMem_Free(str);
3271 }
3272 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003273 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003274 size_t len, len2;
3275
3276 len = wcstombs(NULL, wstr, 0);
3277 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003278 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003279 goto encode_error;
3280 }
3281
3282 bytes = PyBytes_FromStringAndSize(NULL, len);
3283 if (bytes == NULL) {
3284 PyMem_Free(wstr);
3285 return NULL;
3286 }
3287
3288 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3289 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003290 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003291 goto encode_error;
3292 }
3293 PyMem_Free(wstr);
3294 }
3295 return bytes;
3296
3297encode_error:
3298 errmsg = strerror(errno);
3299 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003300
3301 if (error_pos == (size_t)-1)
3302 error_pos = wcstombs_errorpos(wstr);
3303
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003304 PyMem_Free(wstr);
3305 Py_XDECREF(bytes);
3306
Victor Stinner2f197072011-12-17 07:08:30 +01003307 if (errmsg != NULL) {
3308 size_t errlen;
3309 wstr = _Py_char2wchar(errmsg, &errlen);
3310 if (wstr != NULL) {
3311 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003312 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003313 } else
3314 errmsg = NULL;
3315 }
3316 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003317 reason = PyUnicode_FromString(
3318 "wcstombs() encountered an unencodable "
3319 "wide character");
3320 if (reason == NULL)
3321 return NULL;
3322
3323 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3324 "locale", unicode,
3325 (Py_ssize_t)error_pos,
3326 (Py_ssize_t)(error_pos+1),
3327 reason);
3328 Py_DECREF(reason);
3329 if (exc != NULL) {
3330 PyCodec_StrictErrors(exc);
3331 Py_XDECREF(exc);
3332 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003333 return NULL;
3334}
3335
Victor Stinnerad158722010-10-27 00:25:46 +00003336PyObject *
3337PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003338{
Victor Stinner99b95382011-07-04 14:23:54 +02003339#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003340 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003341#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003343#else
Victor Stinner793b5312011-04-27 00:24:21 +02003344 PyInterpreterState *interp = PyThreadState_GET()->interp;
3345 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3346 cannot use it to encode and decode filenames before it is loaded. Load
3347 the Python codec requires to encode at least its own filename. Use the C
3348 version of the locale codec until the codec registry is initialized and
3349 the Python codec is loaded.
3350
3351 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3352 cannot only rely on it: check also interp->fscodec_initialized for
3353 subinterpreters. */
3354 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003355 return PyUnicode_AsEncodedString(unicode,
3356 Py_FileSystemDefaultEncoding,
3357 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003358 }
3359 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003360 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003361 }
Victor Stinnerad158722010-10-27 00:25:46 +00003362#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003363}
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 const char *encoding,
3368 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369{
3370 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003371 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003372
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 if (!PyUnicode_Check(unicode)) {
3374 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 }
Fred Drakee4315f52000-05-09 19:53:39 +00003377
Fred Drakee4315f52000-05-09 19:53:39 +00003378 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003379 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003380 if ((strcmp(lower, "utf-8") == 0) ||
3381 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003382 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003383 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003384 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003385 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003387 }
Victor Stinner37296e82010-06-10 13:36:23 +00003388 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003389 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003390 (strcmp(lower, "iso-8859-1") == 0) ||
3391 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003393#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003394 else if (strcmp(lower, "mbcs") == 0)
3395 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003396#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003397 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400
3401 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003402 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003404 return NULL;
3405
3406 /* The normal path */
3407 if (PyBytes_Check(v))
3408 return v;
3409
3410 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003411 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003412 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003413 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003414
3415 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003416 "encoder %s returned bytearray instead of bytes; "
3417 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003418 encoding);
3419 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003420 Py_DECREF(v);
3421 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003423
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003424 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3425 Py_DECREF(v);
3426 return b;
3427 }
3428
3429 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003430 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3431 "use codecs.encode() to encode to arbitrary types",
3432 encoding,
3433 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003434 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003435 return NULL;
3436}
3437
Alexander Belopolsky40018472011-02-26 01:02:56 +00003438PyObject *
3439PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003440 const char *encoding,
3441 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003442{
3443 PyObject *v;
3444
3445 if (!PyUnicode_Check(unicode)) {
3446 PyErr_BadArgument();
3447 goto onError;
3448 }
3449
3450 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003451 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003452
3453 /* Encode via the codec registry */
3454 v = PyCodec_Encode(unicode, encoding, errors);
3455 if (v == NULL)
3456 goto onError;
3457 if (!PyUnicode_Check(v)) {
3458 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003459 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3460 "use codecs.encode() to encode to arbitrary types",
3461 encoding,
3462 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003463 Py_DECREF(v);
3464 goto onError;
3465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003467
Benjamin Peterson29060642009-01-31 22:14:21 +00003468 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 return NULL;
3470}
3471
/* Find the offset of the first byte sequence in str (len bytes) that
   mbrtowc() reports as invalid or incomplete.  Return 0 when no such
   sequence is found, and always when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    mbstate_t state;
    size_t consumed;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (; len != 0; cursor += consumed, len -= consumed) {
        consumed = mbrtowc(&wc, (char*)cursor, len, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3502
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003503PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003504PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003505 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506{
3507 wchar_t smallbuf[256];
3508 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3509 wchar_t *wstr;
3510 size_t wlen, wlen2;
3511 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003512 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003513 size_t error_pos;
3514 char *errmsg;
3515 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003516
3517 if (locale_error_handler(errors, &surrogateescape) < 0)
3518 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003519
3520 if (str[len] != '\0' || len != strlen(str)) {
3521 PyErr_SetString(PyExc_TypeError, "embedded null character");
3522 return NULL;
3523 }
3524
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003525 if (surrogateescape) {
3526 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003527 wstr = _Py_char2wchar(str, &wlen);
3528 if (wstr == NULL) {
3529 if (wlen == (size_t)-1)
3530 PyErr_NoMemory();
3531 else
3532 PyErr_SetFromErrno(PyExc_OSError);
3533 return NULL;
3534 }
3535
3536 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003537 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003538 }
3539 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003540 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003541#ifndef HAVE_BROKEN_MBSTOWCS
3542 wlen = mbstowcs(NULL, str, 0);
3543#else
3544 wlen = len;
3545#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003546 if (wlen == (size_t)-1)
3547 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003548 if (wlen+1 <= smallbuf_len) {
3549 wstr = smallbuf;
3550 }
3551 else {
3552 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3553 return PyErr_NoMemory();
3554
3555 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3556 if (!wstr)
3557 return PyErr_NoMemory();
3558 }
3559
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003560 wlen2 = mbstowcs(wstr, str, wlen+1);
3561 if (wlen2 == (size_t)-1) {
3562 if (wstr != smallbuf)
3563 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003564 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003565 }
3566#ifdef HAVE_BROKEN_MBSTOWCS
3567 assert(wlen2 == wlen);
3568#endif
3569 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3570 if (wstr != smallbuf)
3571 PyMem_Free(wstr);
3572 }
3573 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003574
3575decode_error:
3576 errmsg = strerror(errno);
3577 assert(errmsg != NULL);
3578
3579 error_pos = mbstowcs_errorpos(str, len);
3580 if (errmsg != NULL) {
3581 size_t errlen;
3582 wstr = _Py_char2wchar(errmsg, &errlen);
3583 if (wstr != NULL) {
3584 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003585 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003586 } else
3587 errmsg = NULL;
3588 }
3589 if (errmsg == NULL)
3590 reason = PyUnicode_FromString(
3591 "mbstowcs() encountered an invalid multibyte sequence");
3592 if (reason == NULL)
3593 return NULL;
3594
3595 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3596 "locale", str, len,
3597 (Py_ssize_t)error_pos,
3598 (Py_ssize_t)(error_pos+1),
3599 reason);
3600 Py_DECREF(reason);
3601 if (exc != NULL) {
3602 PyCodec_StrictErrors(exc);
3603 Py_XDECREF(exc);
3604 }
3605 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003606}
3607
3608PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003609PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003610{
3611 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003612 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003613}
3614
3615
3616PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003617PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003618 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003619 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3620}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003621
Christian Heimes5894ba72007-11-04 11:43:14 +00003622PyObject*
3623PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3624{
Victor Stinner99b95382011-07-04 14:23:54 +02003625#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003626 return PyUnicode_DecodeMBCS(s, size, NULL);
3627#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003628 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003629#else
Victor Stinner793b5312011-04-27 00:24:21 +02003630 PyInterpreterState *interp = PyThreadState_GET()->interp;
3631 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3632 cannot use it to encode and decode filenames before it is loaded. Load
3633 the Python codec requires to encode at least its own filename. Use the C
3634 version of the locale codec until the codec registry is initialized and
3635 the Python codec is loaded.
3636
3637 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3638 cannot only rely on it: check also interp->fscodec_initialized for
3639 subinterpreters. */
3640 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003641 return PyUnicode_Decode(s, size,
3642 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003643 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003644 }
3645 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003646 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003647 }
Victor Stinnerad158722010-10-27 00:25:46 +00003648#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003649}
3650
Martin v. Löwis011e8422009-05-05 04:43:17 +00003651
3652int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003653_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003654{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003655 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003656
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003657 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003658 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003659 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3660 PyUnicode_GET_LENGTH(str), '\0', 1);
3661 if (pos == -1)
3662 return 0;
3663 else
3664 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003665}
3666
Antoine Pitrou13348842012-01-29 18:36:34 +01003667int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003668PyUnicode_FSConverter(PyObject* arg, void* addr)
3669{
3670 PyObject *output = NULL;
3671 Py_ssize_t size;
3672 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003673 if (arg == NULL) {
3674 Py_DECREF(*(PyObject**)addr);
3675 return 1;
3676 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003677 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003678 output = arg;
3679 Py_INCREF(output);
3680 }
3681 else {
3682 arg = PyUnicode_FromObject(arg);
3683 if (!arg)
3684 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003685 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003686 Py_DECREF(arg);
3687 if (!output)
3688 return 0;
3689 if (!PyBytes_Check(output)) {
3690 Py_DECREF(output);
3691 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3692 return 0;
3693 }
3694 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003695 size = PyBytes_GET_SIZE(output);
3696 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003697 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003698 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003699 Py_DECREF(output);
3700 return 0;
3701 }
3702 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003703 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003704}
3705
3706
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003707int
3708PyUnicode_FSDecoder(PyObject* arg, void* addr)
3709{
3710 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003711 if (arg == NULL) {
3712 Py_DECREF(*(PyObject**)addr);
3713 return 1;
3714 }
3715 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003716 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003718 output = arg;
3719 Py_INCREF(output);
3720 }
3721 else {
3722 arg = PyBytes_FromObject(arg);
3723 if (!arg)
3724 return 0;
3725 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3726 PyBytes_GET_SIZE(arg));
3727 Py_DECREF(arg);
3728 if (!output)
3729 return 0;
3730 if (!PyUnicode_Check(output)) {
3731 Py_DECREF(output);
3732 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3733 return 0;
3734 }
3735 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003736 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003737 Py_DECREF(output);
3738 return 0;
3739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003741 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003742 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3743 Py_DECREF(output);
3744 return 0;
3745 }
3746 *(PyObject**)addr = output;
3747 return Py_CLEANUP_SUPPORTED;
3748}
3749
3750
Martin v. Löwis5b222132007-06-10 09:51:05 +00003751char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003753{
Christian Heimesf3863112007-11-22 07:46:41 +00003754 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 return NULL;
3759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003761 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003763 if (PyUnicode_UTF8(unicode) == NULL) {
3764 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3766 if (bytes == NULL)
3767 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3769 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003770 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 Py_DECREF(bytes);
3772 return NULL;
3773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3775 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3776 PyBytes_AS_STRING(bytes),
3777 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778 Py_DECREF(bytes);
3779 }
3780
3781 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003782 *psize = PyUnicode_UTF8_LENGTH(unicode);
3783 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003784}
3785
3786char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3790}
3791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003792Py_UNICODE *
3793PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 const unsigned char *one_byte;
3796#if SIZEOF_WCHAR_T == 4
3797 const Py_UCS2 *two_bytes;
3798#else
3799 const Py_UCS4 *four_bytes;
3800 const Py_UCS4 *ucs4_end;
3801 Py_ssize_t num_surrogates;
3802#endif
3803 wchar_t *w;
3804 wchar_t *wchar_end;
3805
3806 if (!PyUnicode_Check(unicode)) {
3807 PyErr_BadArgument();
3808 return NULL;
3809 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003810 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003812 assert(_PyUnicode_KIND(unicode) != 0);
3813 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003817 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3818 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819 num_surrogates = 0;
3820
3821 for (; four_bytes < ucs4_end; ++four_bytes) {
3822 if (*four_bytes > 0xFFFF)
3823 ++num_surrogates;
3824 }
3825
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3827 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3828 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 PyErr_NoMemory();
3830 return NULL;
3831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 w = _PyUnicode_WSTR(unicode);
3835 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3836 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3838 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003839 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003841 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3842 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 }
3844 else
3845 *w = *four_bytes;
3846
3847 if (w > wchar_end) {
3848 assert(0 && "Miscalculated string end");
3849 }
3850 }
3851 *w = 0;
3852#else
3853 /* sizeof(wchar_t) == 4 */
3854 Py_FatalError("Impossible unicode object state, wstr and str "
3855 "should share memory already.");
3856 return NULL;
3857#endif
3858 }
3859 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3861 (_PyUnicode_LENGTH(unicode) + 1));
3862 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 PyErr_NoMemory();
3864 return NULL;
3865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3867 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3868 w = _PyUnicode_WSTR(unicode);
3869 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003871 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3872 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 for (; w < wchar_end; ++one_byte, ++w)
3874 *w = *one_byte;
3875 /* null-terminate the wstr */
3876 *w = 0;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 for (; w < wchar_end; ++two_bytes, ++w)
3882 *w = *two_bytes;
3883 /* null-terminate the wstr */
3884 *w = 0;
3885#else
3886 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 PyObject_FREE(_PyUnicode_WSTR(unicode));
3888 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 Py_FatalError("Impossible unicode object state, wstr "
3890 "and str should share memory already.");
3891 return NULL;
3892#endif
3893 }
3894 else {
3895 assert(0 && "This should never happen.");
3896 }
3897 }
3898 }
3899 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 *size = PyUnicode_WSTR_LENGTH(unicode);
3901 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003902}
3903
Alexander Belopolsky40018472011-02-26 01:02:56 +00003904Py_UNICODE *
3905PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908}
3909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910
Alexander Belopolsky40018472011-02-26 01:02:56 +00003911Py_ssize_t
3912PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913{
3914 if (!PyUnicode_Check(unicode)) {
3915 PyErr_BadArgument();
3916 goto onError;
3917 }
3918 return PyUnicode_GET_SIZE(unicode);
3919
Benjamin Peterson29060642009-01-31 22:14:21 +00003920 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 return -1;
3922}
3923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924Py_ssize_t
3925PyUnicode_GetLength(PyObject *unicode)
3926{
Victor Stinner07621332012-06-16 04:53:46 +02003927 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928 PyErr_BadArgument();
3929 return -1;
3930 }
Victor Stinner07621332012-06-16 04:53:46 +02003931 if (PyUnicode_READY(unicode) == -1)
3932 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 return PyUnicode_GET_LENGTH(unicode);
3934}
3935
3936Py_UCS4
3937PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3938{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003939 void *data;
3940 int kind;
3941
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003942 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3943 PyErr_BadArgument();
3944 return (Py_UCS4)-1;
3945 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003946 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003947 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 return (Py_UCS4)-1;
3949 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003950 data = PyUnicode_DATA(unicode);
3951 kind = PyUnicode_KIND(unicode);
3952 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953}
3954
3955int
3956PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3957{
3958 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003959 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 return -1;
3961 }
Victor Stinner488fa492011-12-12 00:01:39 +01003962 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003963 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003964 PyErr_SetString(PyExc_IndexError, "string index out of range");
3965 return -1;
3966 }
Victor Stinner488fa492011-12-12 00:01:39 +01003967 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003968 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003969 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3970 PyErr_SetString(PyExc_ValueError, "character out of range");
3971 return -1;
3972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3974 index, ch);
3975 return 0;
3976}
3977
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3983
Victor Stinner554f3f02010-06-16 23:33:54 +00003984/* create or adjust a UnicodeDecodeError */
3985static void
3986make_decode_exception(PyObject **exceptionObject,
3987 const char *encoding,
3988 const char *input, Py_ssize_t length,
3989 Py_ssize_t startpos, Py_ssize_t endpos,
3990 const char *reason)
3991{
3992 if (*exceptionObject == NULL) {
3993 *exceptionObject = PyUnicodeDecodeError_Create(
3994 encoding, input, length, startpos, endpos, reason);
3995 }
3996 else {
3997 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3998 goto onError;
3999 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4000 goto onError;
4001 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4002 goto onError;
4003 }
4004 return;
4005
4006onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004007 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004008}
4009
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors / errorHandler: name of the error handler and a cache for the
       looked-up callable (filled in on first use).
   encoding / reason: passed on to the UnicodeDecodeError.
   input / inend / inptr: updated to point into the bytes object carried by
       the exception, which the callback may have replaced.
   startinpos / endinpos: bounds of the undecodable input; *endinpos is set
       to the position returned by the callback.
   exceptionObject: cache for the exception instance (created or updated).
   output / outpos: wchar_t-kind result string (resized when needed) and the
       current write position in it.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out instead of treating a non-bytes object as bytes below;
           release our reference first, since onError does not. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* HAVE_MBCS */
4116
4117static int
4118unicode_decode_call_errorhandler_writer(
4119 const char *errors, PyObject **errorHandler,
4120 const char *encoding, const char *reason,
4121 const char **input, const char **inend, Py_ssize_t *startinpos,
4122 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4123 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4124{
4125 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4126
4127 PyObject *restuple = NULL;
4128 PyObject *repunicode = NULL;
4129 Py_ssize_t insize;
4130 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004131 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004132 PyObject *inputobj = NULL;
4133
4134 if (*errorHandler == NULL) {
4135 *errorHandler = PyCodec_LookupError(errors);
4136 if (*errorHandler == NULL)
4137 goto onError;
4138 }
4139
4140 make_decode_exception(exceptionObject,
4141 encoding,
4142 *input, *inend - *input,
4143 *startinpos, *endinpos,
4144 reason);
4145 if (*exceptionObject == NULL)
4146 goto onError;
4147
4148 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4149 if (restuple == NULL)
4150 goto onError;
4151 if (!PyTuple_Check(restuple)) {
4152 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4153 goto onError;
4154 }
4155 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004156 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
4158 /* Copy back the bytes variables, which might have been modified by the
4159 callback */
4160 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4161 if (!inputobj)
4162 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004163 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004165 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004166 *input = PyBytes_AS_STRING(inputobj);
4167 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004168 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004169 /* we can DECREF safely, as the exception has another reference,
4170 so the object won't go away. */
4171 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004175 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4177 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179
Victor Stinner8f674cc2013-04-17 23:02:17 +02004180 if (PyUnicode_READY(repunicode) < 0)
4181 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004182 replen = PyUnicode_GET_LENGTH(repunicode);
4183 writer->min_length += replen;
4184 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004185 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004186 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004187 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004190 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193 Py_XDECREF(restuple);
4194 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199}
4200
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized so that any expression can be
 * passed for them; c is evaluated more than once, so it must not have
 * side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004281
Alexander Belopolsky40018472011-02-26 01:02:56 +00004282PyObject *
4283PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004284 Py_ssize_t size,
4285 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004287 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4288}
4289
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290/* The decoder. The only state we preserve is our read position,
4291 * i.e. how many characters we have consumed. So if we end in the
4292 * middle of a shift sequence we have to back off the read position
4293 * and the output to the beginning of the sequence, otherwise we lose
4294 * all the shift state (seen bits, number of bits seen, high
4295 * surrogate). */
4296
Alexander Belopolsky40018472011-02-26 01:02:56 +00004297PyObject *
4298PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004299 Py_ssize_t size,
4300 const char *errors,
4301 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004304 Py_ssize_t startinpos;
4305 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004307 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004308 const char *errmsg = "";
4309 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004310 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311 unsigned int base64bits = 0;
4312 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004313 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 PyObject *errorHandler = NULL;
4315 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004317 if (size == 0) {
4318 if (consumed)
4319 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004320 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004321 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004324 _PyUnicodeWriter_Init(&writer);
4325 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326
4327 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328 e = s + size;
4329
4330 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004333 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335 if (inShift) { /* in a base-64 section */
4336 if (IS_BASE64(ch)) { /* consume a base-64 character */
4337 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4338 base64bits += 6;
4339 s++;
4340 if (base64bits >= 16) {
4341 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004342 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 base64bits -= 16;
4344 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004345 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004346 if (surrogate) {
4347 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004348 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4349 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004350 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004353 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 }
4355 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004356 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004357 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
4360 }
Victor Stinner551ac952011-11-29 22:58:13 +01004361 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 /* first surrogate */
4363 surrogate = outCh;
4364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004366 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 }
4369 }
4370 }
4371 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 inShift = 0;
4373 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004375 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004376 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004377 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (base64bits > 0) { /* left-over bits */
4380 if (base64bits >= 6) {
4381 /* We've seen at least one base-64 character */
4382 errmsg = "partial character in shift sequence";
4383 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 else {
4386 /* Some bits remain; they should be zero */
4387 if (base64buffer != 0) {
4388 errmsg = "non-zero padding bits in shift sequence";
4389 goto utf7Error;
4390 }
4391 }
4392 }
4393 if (ch != '-') {
4394 /* '-' is absorbed; other terminating
4395 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004396 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004397 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
4400 }
4401 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 s++; /* consume '+' */
4404 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004406 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 }
4409 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004413 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
4415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004418 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else {
4422 startinpos = s-starts;
4423 s++;
4424 errmsg = "unexpected special character";
4425 goto utf7Error;
4426 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 errors, &errorHandler,
4432 "utf7", errmsg,
4433 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004434 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004436 }
4437
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 /* end of string */
4439
4440 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4441 /* if we're in an inconsistent state, that's an error */
4442 if (surrogate ||
4443 (base64bits >= 6) ||
4444 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004446 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 errors, &errorHandler,
4448 "utf7", "unterminated shift sequence",
4449 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 goto onError;
4452 if (s < e)
4453 goto restart;
4454 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456
4457 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004461 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004462 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004463 writer.kind, writer.data, shiftOutStart);
4464 Py_XDECREF(errorHandler);
4465 Py_XDECREF(exc);
4466 _PyUnicodeWriter_Dealloc(&writer);
4467 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004468 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004469 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 }
4471 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004474 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(errorHandler);
4477 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004478 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 Py_XDECREF(errorHandler);
4482 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 return NULL;
4485}
4486
4487
Alexander Belopolsky40018472011-02-26 01:02:56 +00004488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489_PyUnicode_EncodeUTF7(PyObject *str,
4490 int base64SetO,
4491 int base64WhiteSpace,
4492 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004494 int kind;
4495 void *data;
4496 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004497 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 unsigned int base64bits = 0;
4501 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 char * out;
4503 char * start;
4504
Benjamin Petersonbac79492012-01-14 13:34:47 -05004505 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004506 return NULL;
4507 kind = PyUnicode_KIND(str);
4508 data = PyUnicode_DATA(str);
4509 len = PyUnicode_GET_LENGTH(str);
4510
4511 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004514 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004515 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004516 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004517 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 if (v == NULL)
4519 return NULL;
4520
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004521 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004522 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004523 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 if (inShift) {
4526 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4527 /* shifting out */
4528 if (base64bits) { /* output remaining bits */
4529 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4530 base64buffer = 0;
4531 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 }
4533 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 /* Characters not in the BASE64 set implicitly unshift the sequence
4535 so no '-' is required, except if the character is itself a '-' */
4536 if (IS_BASE64(ch) || ch == '-') {
4537 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 *out++ = (char) ch;
4540 }
4541 else {
4542 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004543 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 else { /* not in a shift sequence */
4546 if (ch == '+') {
4547 *out++ = '+';
4548 *out++ = '-';
4549 }
4550 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4551 *out++ = (char) ch;
4552 }
4553 else {
4554 *out++ = '+';
4555 inShift = 1;
4556 goto encode_char;
4557 }
4558 }
4559 continue;
4560encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004562 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004563
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 /* code first surrogate */
4565 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004566 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 while (base64bits >= 6) {
4568 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4569 base64bits -= 6;
4570 }
4571 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004572 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits += 16;
4575 base64buffer = (base64buffer << 16) | ch;
4576 while (base64bits >= 6) {
4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4578 base64bits -= 6;
4579 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004580 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (base64bits)
4582 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4583 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004585 if (_PyBytes_Resize(&v, out - start) < 0)
4586 return NULL;
4587 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589PyObject *
4590PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4591 Py_ssize_t size,
4592 int base64SetO,
4593 int base64WhiteSpace,
4594 const char *errors)
4595{
4596 PyObject *result;
4597 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4598 if (tmp == NULL)
4599 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004600 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004601 base64WhiteSpace, errors);
4602 Py_DECREF(tmp);
4603 return result;
4604}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606#undef IS_BASE64
4607#undef FROM_BASE64
4608#undef TO_BASE64
4609#undef DECODE_DIRECT
4610#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612/* --- UTF-8 Codec -------------------------------------------------------- */
4613
Alexander Belopolsky40018472011-02-26 01:02:56 +00004614PyObject *
4615PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004616 Py_ssize_t size,
4617 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618{
Walter Dörwald69652032004-09-07 20:24:22 +00004619 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4620}
4621
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004622#include "stringlib/asciilib.h"
4623#include "stringlib/codecs.h"
4624#include "stringlib/undef.h"
4625
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004626#include "stringlib/ucs1lib.h"
4627#include "stringlib/codecs.h"
4628#include "stringlib/undef.h"
4629
4630#include "stringlib/ucs2lib.h"
4631#include "stringlib/codecs.h"
4632#include "stringlib/undef.h"
4633
4634#include "stringlib/ucs4lib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
Antoine Pitrouab868312009-01-10 15:40:25 +00004638/* Mask to quickly check whether a C 'long' contains a
4639 non-ASCII, UTF8-encoded char. */
4640#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004641# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004642#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004643# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004644#else
4645# error C 'long' size should be either 4 or 8!
4646#endif
4647
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648static Py_ssize_t
4649ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004652 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004654 /*
4655 * Issue #17237: m68k is a bit different from most architectures in
4656 * that objects do not use "natural alignment" - for example, int and
4657 * long are only aligned at 2-byte boundaries. Therefore the assert()
4658 * won't work; also, tests have shown that skipping the "optimised
4659 * version" will even speed up m68k.
4660 */
4661#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004663 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4664 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004665 /* Fast path, see in STRINGLIB(utf8_decode) for
4666 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004667 /* Help allocation */
4668 const char *_p = p;
4669 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670 while (_p < aligned_end) {
4671 unsigned long value = *(const unsigned long *) _p;
4672 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 *((unsigned long *)q) = value;
4675 _p += SIZEOF_LONG;
4676 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004677 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678 p = _p;
4679 while (p < end) {
4680 if ((unsigned char)*p & 0x80)
4681 break;
4682 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004687#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 while (p < end) {
4689 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4690 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004691 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004692 /* Help allocation */
4693 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004694 while (_p < aligned_end) {
4695 unsigned long value = *(unsigned long *) _p;
4696 if (value & ASCII_CHAR_MASK)
4697 break;
4698 _p += SIZEOF_LONG;
4699 }
4700 p = _p;
4701 if (_p == end)
4702 break;
4703 }
4704 if ((unsigned char)*p & 0x80)
4705 break;
4706 ++p;
4707 }
4708 memcpy(dest, start, p - start);
4709 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710}
Antoine Pitrouab868312009-01-10 15:40:25 +00004711
Victor Stinner785938e2011-12-11 20:09:03 +01004712PyObject *
4713PyUnicode_DecodeUTF8Stateful(const char *s,
4714 Py_ssize_t size,
4715 const char *errors,
4716 Py_ssize_t *consumed)
4717{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004719 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721
4722 Py_ssize_t startinpos;
4723 Py_ssize_t endinpos;
4724 const char *errmsg = "";
4725 PyObject *errorHandler = NULL;
4726 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004727
4728 if (size == 0) {
4729 if (consumed)
4730 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004731 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004732 }
4733
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4735 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004736 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737 *consumed = 1;
4738 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004739 }
4740
Victor Stinner8f674cc2013-04-17 23:02:17 +02004741 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004742 writer.min_length = size;
4743 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004744 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004745
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 writer.pos = ascii_decode(s, end, writer.data);
4747 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 while (s < end) {
4749 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004750 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004752 if (PyUnicode_IS_ASCII(writer.buffer))
4753 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004757 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 } else {
4759 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004760 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 }
4762
4763 switch (ch) {
4764 case 0:
4765 if (s == end || consumed)
4766 goto End;
4767 errmsg = "unexpected end of data";
4768 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004769 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 break;
4771 case 1:
4772 errmsg = "invalid start byte";
4773 startinpos = s - starts;
4774 endinpos = startinpos + 1;
4775 break;
4776 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004777 case 3:
4778 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 errmsg = "invalid continuation byte";
4780 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004781 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 break;
4783 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004784 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785 goto onError;
4786 continue;
4787 }
4788
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004789 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004790 errors, &errorHandler,
4791 "utf-8", errmsg,
4792 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004793 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004795 }
4796
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798 if (consumed)
4799 *consumed = s - starts;
4800
4801 Py_XDECREF(errorHandler);
4802 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004803 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804
4805onError:
4806 Py_XDECREF(errorHandler);
4807 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004810}
4811
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s, size: the UTF-8 bytes to decode and their count.

   Bytes that cannot be decoded are stored as 0xDC00 + byte value.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The value handed back here is a code point that does not
               fit in a 16-bit unit, not a surrogate: it is what gets
               split into a surrogate pair just below. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868/* Primary internal function which creates utf8 encoded bytes objects.
4869
4870 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004871 and allocate exactly as much space needed at the end. Else allocate the
4872 maximum possible needed (4 result bytes per Unicode character), and return
4873 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004874*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004875PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004876_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877{
Victor Stinner6099a032011-12-18 14:22:26 +01004878 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879 void *data;
4880 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 if (!PyUnicode_Check(unicode)) {
4883 PyErr_BadArgument();
4884 return NULL;
4885 }
4886
4887 if (PyUnicode_READY(unicode) == -1)
4888 return NULL;
4889
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004890 if (PyUnicode_UTF8(unicode))
4891 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4892 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
4894 kind = PyUnicode_KIND(unicode);
4895 data = PyUnicode_DATA(unicode);
4896 size = PyUnicode_GET_LENGTH(unicode);
4897
Benjamin Petersonead6b532011-12-20 17:23:42 -06004898 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004899 default:
4900 assert(0);
4901 case PyUnicode_1BYTE_KIND:
4902 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4903 assert(!PyUnicode_IS_ASCII(unicode));
4904 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4905 case PyUnicode_2BYTE_KIND:
4906 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4907 case PyUnicode_4BYTE_KIND:
4908 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910}
4911
Alexander Belopolsky40018472011-02-26 01:02:56 +00004912PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004913PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4914 Py_ssize_t size,
4915 const char *errors)
4916{
4917 PyObject *v, *unicode;
4918
4919 unicode = PyUnicode_FromUnicode(s, size);
4920 if (unicode == NULL)
4921 return NULL;
4922 v = _PyUnicode_AsUTF8String(unicode, errors);
4923 Py_DECREF(unicode);
4924 return v;
4925}
4926
4927PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004928PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004930 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931}
4932
Walter Dörwald41980ca2007-08-16 21:55:45 +00004933/* --- UTF-32 Codec ------------------------------------------------------- */
4934
4935PyObject *
4936PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 Py_ssize_t size,
4938 const char *errors,
4939 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940{
4941 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4942}
4943
4944PyObject *
4945PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors,
4948 int *byteorder,
4949 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950{
4951 const char *starts = s;
4952 Py_ssize_t startinpos;
4953 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004954 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004955 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004956 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004957 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 PyObject *errorHandler = NULL;
4960 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004961
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962 q = (unsigned char *)s;
4963 e = q + size;
4964
4965 if (byteorder)
4966 bo = *byteorder;
4967
4968 /* Check for BOM marks (U+FEFF) in the input and adjust current
4969 byte order setting accordingly. In native mode, the leading BOM
4970 mark is skipped, in all other modes, it is copied to the output
4971 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004972 if (bo == 0 && size >= 4) {
4973 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4974 if (bom == 0x0000FEFF) {
4975 bo = -1;
4976 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004978 else if (bom == 0xFFFE0000) {
4979 bo = 1;
4980 q += 4;
4981 }
4982 if (byteorder)
4983 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 }
4985
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 if (q == e) {
4987 if (consumed)
4988 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004989 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990 }
4991
Victor Stinnere64322e2012-10-30 23:12:47 +01004992#ifdef WORDS_BIGENDIAN
4993 le = bo < 0;
4994#else
4995 le = bo <= 0;
4996#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004997 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004998
Victor Stinner8f674cc2013-04-17 23:02:17 +02004999 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005000 writer.min_length = (e - q + 3) / 4;
5001 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005003
Victor Stinnere64322e2012-10-30 23:12:47 +01005004 while (1) {
5005 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005007
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 enum PyUnicode_Kind kind = writer.kind;
5010 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005011 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 if (le) {
5014 do {
5015 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5016 if (ch > maxch)
5017 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005018 if (kind != PyUnicode_1BYTE_KIND &&
5019 Py_UNICODE_IS_SURROGATE(ch))
5020 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005021 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 q += 4;
5023 } while (q <= last);
5024 }
5025 else {
5026 do {
5027 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5028 if (ch > maxch)
5029 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005030 if (kind != PyUnicode_1BYTE_KIND &&
5031 Py_UNICODE_IS_SURROGATE(ch))
5032 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005033 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 q += 4;
5035 } while (q <= last);
5036 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005037 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005038 }
5039
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005040 if (Py_UNICODE_IS_SURROGATE(ch)) {
5041 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5042 startinpos = ((const char *)q) - starts;
5043 endinpos = startinpos + 4;
5044 }
5045 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005046 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005048 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005050 startinpos = ((const char *)q) - starts;
5051 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005053 else {
5054 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005055 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005056 goto onError;
5057 q += 4;
5058 continue;
5059 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005061 startinpos = ((const char *)q) - starts;
5062 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005064
5065 /* The remaining input chars are ignored if the callback
5066 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005067 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005069 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005071 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073 }
5074
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 Py_XDECREF(errorHandler);
5079 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005080 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005083 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 Py_XDECREF(errorHandler);
5085 Py_XDECREF(exc);
5086 return NULL;
5087}
5088
5089PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005090_PyUnicode_EncodeUTF32(PyObject *str,
5091 const char *errors,
5092 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005094 int kind;
5095 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005096 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005098 unsigned char *p;
5099 Py_ssize_t nsize, i;
5100 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005101#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005102 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005104 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005106 const char *encoding;
5107 PyObject *errorHandler = NULL;
5108 PyObject *exc = NULL;
5109 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110
Serhiy Storchaka30793282014-01-04 22:44:01 +02005111#define STORECHAR(CH) \
5112 do { \
5113 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5114 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5115 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5116 p[iorder[0]] = (CH) & 0xff; \
5117 p += 4; \
5118 } while(0)
5119
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (!PyUnicode_Check(str)) {
5121 PyErr_BadArgument();
5122 return NULL;
5123 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005124 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005125 return NULL;
5126 kind = PyUnicode_KIND(str);
5127 data = PyUnicode_DATA(str);
5128 len = PyUnicode_GET_LENGTH(str);
5129
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005130 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005131 if (nsize > PY_SSIZE_T_MAX / 4)
5132 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005133 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (v == NULL)
5135 return NULL;
5136
Serhiy Storchaka30793282014-01-04 22:44:01 +02005137 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005139 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005140 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005141 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142
Serhiy Storchaka30793282014-01-04 22:44:01 +02005143 if (byteorder == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005149 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005150 }
5151 else if (byteorder == 1) {
5152 /* force BE */
5153 iorder[0] = 3;
5154 iorder[1] = 2;
5155 iorder[2] = 1;
5156 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005157 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005158 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005159 else
5160 encoding = "utf-32";
5161
5162 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005163 for (i = 0; i < len; i++)
5164 STORECHAR(PyUnicode_READ(kind, data, i));
5165 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166 }
5167
Serhiy Storchaka30793282014-01-04 22:44:01 +02005168 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005169 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005170 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5171 i++;
5172 assert(ch <= MAX_UNICODE);
5173 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5174 STORECHAR(ch);
5175 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005177
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005178 rep = unicode_encode_call_errorhandler(
5179 errors, &errorHandler,
5180 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005181 str, &exc, i-1, i, &i);
5182
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 if (!rep)
5184 goto error;
5185
5186 if (PyBytes_Check(rep)) {
5187 repsize = PyBytes_GET_SIZE(rep);
5188 if (repsize & 3) {
5189 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005190 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 "surrogates not allowed");
5192 goto error;
5193 }
5194 moreunits = repsize / 4;
5195 }
5196 else {
5197 assert(PyUnicode_Check(rep));
5198 if (PyUnicode_READY(rep) < 0)
5199 goto error;
5200 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5201 if (!PyUnicode_IS_ASCII(rep)) {
5202 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005203 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005204 "surrogates not allowed");
5205 goto error;
5206 }
5207 }
5208
5209 /* four bytes are reserved for each surrogate */
5210 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005211 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005212 Py_ssize_t morebytes = 4 * (moreunits - 1);
5213 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5214 /* integer overflow */
5215 PyErr_NoMemory();
5216 goto error;
5217 }
5218 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5219 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005220 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005221 }
5222
5223 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005224 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5225 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005226 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005227 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005228 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005229 repdata = PyUnicode_1BYTE_DATA(rep);
5230 while (repsize--) {
5231 Py_UCS4 ch = *repdata++;
5232 STORECHAR(ch);
5233 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005234 }
5235
5236 Py_CLEAR(rep);
5237 }
5238
5239 /* Cut back to size actually needed. This is necessary for, for example,
5240 encoding of a string containing isolated surrogates and the 'ignore'
5241 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005242 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005243 if (nsize != PyBytes_GET_SIZE(v))
5244 _PyBytes_Resize(&v, nsize);
5245 Py_XDECREF(errorHandler);
5246 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005247 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005248 error:
5249 Py_XDECREF(rep);
5250 Py_XDECREF(errorHandler);
5251 Py_XDECREF(exc);
5252 Py_XDECREF(v);
5253 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005254#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255}
5256
Alexander Belopolsky40018472011-02-26 01:02:56 +00005257PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005258PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5259 Py_ssize_t size,
5260 const char *errors,
5261 int byteorder)
5262{
5263 PyObject *result;
5264 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5265 if (tmp == NULL)
5266 return NULL;
5267 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5268 Py_DECREF(tmp);
5269 return result;
5270}
5271
5272PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005273PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274{
Victor Stinnerb960b342011-11-20 19:12:52 +01005275 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276}
5277
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278/* --- UTF-16 Codec ------------------------------------------------------- */
5279
Tim Peters772747b2001-08-09 22:21:55 +00005280PyObject *
5281PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 Py_ssize_t size,
5283 const char *errors,
5284 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
Walter Dörwald69652032004-09-07 20:24:22 +00005286 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5287}
5288
5289PyObject *
5290PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 Py_ssize_t size,
5292 const char *errors,
5293 int *byteorder,
5294 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t startinpos;
5298 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005300 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005301 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005302 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005303 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 PyObject *errorHandler = NULL;
5305 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005306 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Tim Peters772747b2001-08-09 22:21:55 +00005308 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005309 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
5311 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005312 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005314 /* Check for BOM marks (U+FEFF) in the input and adjust current
5315 byte order setting accordingly. In native mode, the leading BOM
5316 mark is skipped, in all other modes, it is copied to the output
5317 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 if (bo == 0 && size >= 2) {
5319 const Py_UCS4 bom = (q[1] << 8) | q[0];
5320 if (bom == 0xFEFF) {
5321 q += 2;
5322 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005324 else if (bom == 0xFFFE) {
5325 q += 2;
5326 bo = 1;
5327 }
5328 if (byteorder)
5329 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 if (q == e) {
5333 if (consumed)
5334 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005335 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005336 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005337
Christian Heimes743e0cd2012-10-17 23:52:17 +02005338#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005339 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005340 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005341#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005342 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005344#endif
Tim Peters772747b2001-08-09 22:21:55 +00005345
Antoine Pitrou63065d72012-05-15 23:48:04 +02005346 /* Note: size will always be longer than the resulting Unicode
5347 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005348 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005349 writer.min_length = (e - q + 1) / 2;
5350 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005351 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005352
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 while (1) {
5354 Py_UCS4 ch = 0;
5355 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005356 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005357 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005359 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005361 native_ordering);
5362 else
5363 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005365 native_ordering);
5366 } else if (kind == PyUnicode_2BYTE_KIND) {
5367 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005369 native_ordering);
5370 } else {
5371 assert(kind == PyUnicode_4BYTE_KIND);
5372 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005374 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005375 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377
Antoine Pitrou63065d72012-05-15 23:48:04 +02005378 switch (ch)
5379 {
5380 case 0:
5381 /* remaining byte at the end? (size should be even) */
5382 if (q == e || consumed)
5383 goto End;
5384 errmsg = "truncated data";
5385 startinpos = ((const char *)q) - starts;
5386 endinpos = ((const char *)e) - starts;
5387 break;
5388 /* The remaining input chars are ignored if the callback
5389 chooses to skip the input */
5390 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005391 q -= 2;
5392 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005393 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005394 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005395 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005396 endinpos = ((const char *)e) - starts;
5397 break;
5398 case 2:
5399 errmsg = "illegal encoding";
5400 startinpos = ((const char *)q) - 2 - starts;
5401 endinpos = startinpos + 2;
5402 break;
5403 case 3:
5404 errmsg = "illegal UTF-16 surrogate";
5405 startinpos = ((const char *)q) - 4 - starts;
5406 endinpos = startinpos + 2;
5407 break;
5408 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005409 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005410 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 continue;
5412 }
5413
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005414 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005415 errors,
5416 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005417 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005418 &starts,
5419 (const char **)&e,
5420 &startinpos,
5421 &endinpos,
5422 &exc,
5423 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005424 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 }
5427
Antoine Pitrou63065d72012-05-15 23:48:04 +02005428End:
Walter Dörwald69652032004-09-07 20:24:22 +00005429 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 Py_XDECREF(errorHandler);
5433 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005434 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005437 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 Py_XDECREF(errorHandler);
5439 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return NULL;
5441}
5442
Tim Peters772747b2001-08-09 22:21:55 +00005443PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444_PyUnicode_EncodeUTF16(PyObject *str,
5445 const char *errors,
5446 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005448 enum PyUnicode_Kind kind;
5449 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005450 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005451 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005452 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005453 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005454#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005455 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005456#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005457 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005458#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 const char *encoding;
5460 Py_ssize_t nsize, pos;
5461 PyObject *errorHandler = NULL;
5462 PyObject *exc = NULL;
5463 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 if (!PyUnicode_Check(str)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005469 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005470 return NULL;
5471 kind = PyUnicode_KIND(str);
5472 data = PyUnicode_DATA(str);
5473 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005474
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005476 if (kind == PyUnicode_4BYTE_KIND) {
5477 const Py_UCS4 *in = (const Py_UCS4 *)data;
5478 const Py_UCS4 *end = in + len;
5479 while (in < end)
5480 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005481 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005482 }
5483 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 nsize = len + pairs + (byteorder == 0);
5486 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 if (v == NULL)
5488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005490 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005491 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005492 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005494 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005495 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005496 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005497
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 if (kind == PyUnicode_1BYTE_KIND) {
5499 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5500 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005501 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005502
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 if (byteorder < 0)
5504 encoding = "utf-16-le";
5505 else if (byteorder > 0)
5506 encoding = "utf-16-be";
5507 else
5508 encoding = "utf-16";
5509
5510 pos = 0;
5511 while (pos < len) {
5512 Py_ssize_t repsize, moreunits;
5513
5514 if (kind == PyUnicode_2BYTE_KIND) {
5515 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5516 &out, native_ordering);
5517 }
5518 else {
5519 assert(kind == PyUnicode_4BYTE_KIND);
5520 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5521 &out, native_ordering);
5522 }
5523 if (pos == len)
5524 break;
5525
5526 rep = unicode_encode_call_errorhandler(
5527 errors, &errorHandler,
5528 encoding, "surrogates not allowed",
5529 str, &exc, pos, pos + 1, &pos);
5530 if (!rep)
5531 goto error;
5532
5533 if (PyBytes_Check(rep)) {
5534 repsize = PyBytes_GET_SIZE(rep);
5535 if (repsize & 1) {
5536 raise_encode_exception(&exc, encoding,
5537 str, pos - 1, pos,
5538 "surrogates not allowed");
5539 goto error;
5540 }
5541 moreunits = repsize / 2;
5542 }
5543 else {
5544 assert(PyUnicode_Check(rep));
5545 if (PyUnicode_READY(rep) < 0)
5546 goto error;
5547 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5548 if (!PyUnicode_IS_ASCII(rep)) {
5549 raise_encode_exception(&exc, encoding,
5550 str, pos - 1, pos,
5551 "surrogates not allowed");
5552 goto error;
5553 }
5554 }
5555
5556 /* two bytes are reserved for each surrogate */
5557 if (moreunits > 1) {
5558 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5559 Py_ssize_t morebytes = 2 * (moreunits - 1);
5560 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5561 /* integer overflow */
5562 PyErr_NoMemory();
5563 goto error;
5564 }
5565 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5566 goto error;
5567 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5568 }
5569
5570 if (PyBytes_Check(rep)) {
5571 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5572 out += moreunits;
5573 } else /* rep is unicode */ {
5574 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5575 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5576 &out, native_ordering);
5577 }
5578
5579 Py_CLEAR(rep);
5580 }
5581
5582 /* Cut back to size actually needed. This is necessary for, for example,
5583 encoding of a string containing isolated surrogates and the 'ignore' handler
5584 is used. */
5585 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5586 if (nsize != PyBytes_GET_SIZE(v))
5587 _PyBytes_Resize(&v, nsize);
5588 Py_XDECREF(errorHandler);
5589 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005590 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005591 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005592 error:
5593 Py_XDECREF(rep);
5594 Py_XDECREF(errorHandler);
5595 Py_XDECREF(exc);
5596 Py_XDECREF(v);
5597 return NULL;
5598#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599}
5600
Alexander Belopolsky40018472011-02-26 01:02:56 +00005601PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005602PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5603 Py_ssize_t size,
5604 const char *errors,
5605 int byteorder)
5606{
5607 PyObject *result;
5608 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5609 if (tmp == NULL)
5610 return NULL;
5611 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5612 Py_DECREF(tmp);
5613 return result;
5614}
5615
5616PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005617PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620}
5621
5622/* --- Unicode Escape Codec ----------------------------------------------- */
5623
/* Helper function for PyUnicode_DecodeUnicodeEscape: computes the number of
   characters the escaped string decodes to, provided the result is certain
   to be pure ASCII.

   Returns -1 when that cannot be guaranteed:
     - size is negative;
     - the input contains a byte above 127;
     - a backslash is the last byte, or is followed by a byte above 127;
     - an octal, \x, \u, \U or \N escape is present.
   Otherwise returns the length of the required buffer: 1 for a plain
   character or a recognised single-character escape, 0 for
   backslash-newline, and 2 for a backslash followed by any other character
   (both are counted).
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5678
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): no assignment is visible in this part of the file --
   presumably it is filled in on first use by the \N{...} escape handling;
   confirm before relying on it being non-NULL. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005679static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005680
Alexander Belopolsky40018472011-02-26 01:02:56 +00005681PyObject *
5682PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005683 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005684 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005687 Py_ssize_t startinpos;
5688 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 char* message;
5692 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 PyObject *errorHandler = NULL;
5694 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005696
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005697 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005698 if (len == 0)
5699 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005700
5701 /* After length_of_escaped_ascii_string() there are two alternatives,
5702 either the string is pure ASCII with named escapes like \n, etc.
5703 and we determined it's exact size (common case)
5704 or it contains \x, \u, ... escape sequences. then we create a
5705 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005706 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005708 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 }
5710 else {
5711 /* Escaped strings will always be longer than the resulting
5712 Unicode string, so we start with size here and then reduce the
5713 length after conversion to the true value.
5714 (but if the error callback returns a long replacement string
5715 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005716 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005717 }
5718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005720 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005722
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 while (s < end) {
5724 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005725 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
5728 /* Non-escape characters are interpreted as Unicode ordinals */
5729 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005730 x = (unsigned char)*s;
5731 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005732 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 continue;
5735 }
5736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 /* \ - Escapes */
5739 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005740 c = *s++;
5741 if (s > end)
5742 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005743
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005744 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005747#define WRITECHAR(ch) \
5748 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005749 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005751 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005752
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005754 case '\\': WRITECHAR('\\'); break;
5755 case '\'': WRITECHAR('\''); break;
5756 case '\"': WRITECHAR('\"'); break;
5757 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005758 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 case 'f': WRITECHAR('\014'); break;
5760 case 't': WRITECHAR('\t'); break;
5761 case 'n': WRITECHAR('\n'); break;
5762 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005763 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005764 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005765 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 case '0': case '1': case '2': case '3':
5770 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005771 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005772 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005773 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005774 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005775 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 break;
5779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* hex escapes */
5781 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005783 digits = 2;
5784 message = "truncated \\xXX escape";
5785 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005789 digits = 4;
5790 message = "truncated \\uXXXX escape";
5791 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005794 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005795 digits = 8;
5796 message = "truncated \\UXXXXXXXX escape";
5797 hexescape:
5798 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005799 if (end - s < digits) {
5800 /* count only hex digits */
5801 for (; s < end; ++s) {
5802 c = (unsigned char)*s;
5803 if (!Py_ISXDIGIT(c))
5804 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005805 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005806 goto error;
5807 }
5808 for (; digits--; ++s) {
5809 c = (unsigned char)*s;
5810 if (!Py_ISXDIGIT(c))
5811 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005812 chr = (chr<<4) & ~0xF;
5813 if (c >= '0' && c <= '9')
5814 chr += c - '0';
5815 else if (c >= 'a' && c <= 'f')
5816 chr += 10 + c - 'a';
5817 else
5818 chr += 10 + c - 'A';
5819 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005820 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 /* _decoding_error will have already written into the
5822 target buffer. */
5823 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005824 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005825 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005826 message = "illegal Unicode character";
5827 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005828 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005829 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 break;
5831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 case 'N':
5834 message = "malformed \\N character escape";
5835 if (ucnhash_CAPI == NULL) {
5836 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5838 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 if (ucnhash_CAPI == NULL)
5840 goto ucnhashError;
5841 }
5842 if (*s == '{') {
5843 const char *start = s+1;
5844 /* look for the closing brace */
5845 while (*s != '}' && s < end)
5846 s++;
5847 if (s > start && s < end && *s == '}') {
5848 /* found a name. look it up in the unicode database */
5849 message = "unknown Unicode character name";
5850 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005851 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005852 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005853 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005854 goto store;
5855 }
5856 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005857 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005858
5859 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005860 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 message = "\\ at end of string";
5862 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005863 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005864 }
5865 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005866 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005867 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005868 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005869 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005871 continue;
5872
5873 error:
5874 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005875 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005876 errors, &errorHandler,
5877 "unicodeescape", message,
5878 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005879 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005880 goto onError;
5881 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005883#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005884
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005885 Py_XDECREF(errorHandler);
5886 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005887 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005888
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005890 PyErr_SetString(
5891 PyExc_UnicodeError,
5892 "\\N escapes not supported (can't load unicodedata module)"
5893 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005897 return NULL;
5898
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 Py_XDECREF(errorHandler);
5902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 return NULL;
5904}
5905
5906/* Return a Unicode-Escape string version of the Unicode object.
5907
5908 If quotes is true, the string is enclosed in u"" or u'' quotes as
5909 appropriate.
5910
5911*/
5912
Alexander Belopolsky40018472011-02-26 01:02:56 +00005913PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005917 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 int kind;
5920 void *data;
5921 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922
Ezio Melottie7f90372012-10-05 03:33:31 +03005923 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005924 escape.
5925
Ezio Melottie7f90372012-10-05 03:33:31 +03005926 For UCS1 strings it's '\xxx', 4 bytes per source character.
5927 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5928 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005929 */
5930
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 if (!PyUnicode_Check(unicode)) {
5932 PyErr_BadArgument();
5933 return NULL;
5934 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005935 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936 return NULL;
5937 len = PyUnicode_GET_LENGTH(unicode);
5938 kind = PyUnicode_KIND(unicode);
5939 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005940 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5942 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5943 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5944 }
5945
5946 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 return PyBytes_FromStringAndSize(NULL, 0);
5948
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005951
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 if (repr == NULL)
5957 return NULL;
5958
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005959 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005962 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005963
Walter Dörwald79e913e2007-05-12 11:08:06 +00005964 /* Escape backslashes */
5965 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 *p++ = '\\';
5967 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005968 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005971 /* Map 21-bit characters to '\U00xxxxxx' */
5972 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005973 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005974 *p++ = '\\';
5975 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005976 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5982 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5983 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005985 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005988 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 *p++ = '\\';
5990 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005991 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5992 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5993 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5994 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005996
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005997 /* Map special whitespace to '\t', \n', '\r' */
5998 else if (ch == '\t') {
5999 *p++ = '\\';
6000 *p++ = 't';
6001 }
6002 else if (ch == '\n') {
6003 *p++ = '\\';
6004 *p++ = 'n';
6005 }
6006 else if (ch == '\r') {
6007 *p++ = '\\';
6008 *p++ = 'r';
6009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006010
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006011 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006012 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006014 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006015 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6016 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006017 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006018
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 /* Copy everything else as-is */
6020 else
6021 *p++ = (char) ch;
6022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 assert(p - PyBytes_AS_STRING(repr) > 0);
6025 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6026 return NULL;
6027 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
Alexander Belopolsky40018472011-02-26 01:02:56 +00006030PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6032 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 PyObject *result;
6035 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6036 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 result = PyUnicode_AsUnicodeEscapeString(tmp);
6039 Py_DECREF(tmp);
6040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041}
6042
6043/* --- Raw Unicode Escape Codec ------------------------------------------- */
6044
Alexander Belopolsky40018472011-02-26 01:02:56 +00006045PyObject *
6046PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006047 Py_ssize_t size,
6048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006051 Py_ssize_t startinpos;
6052 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 const char *end;
6055 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 PyObject *errorHandler = NULL;
6057 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006058
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006059 if (size == 0)
6060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 /* Escaped strings will always be longer than the resulting
6063 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 length after conversion to the true value. (But decoding error
6065 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006066 _PyUnicodeWriter_Init(&writer);
6067 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 end = s + size;
6070 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 unsigned char c;
6072 Py_UCS4 x;
6073 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006074 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 /* Non-escape characters are interpreted as Unicode ordinals */
6077 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006078 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006079 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006080 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 startinpos = s-starts;
6084
6085 /* \u-escapes are only interpreted iff the number of leading
6086 backslashes if odd */
6087 bs = s;
6088 for (;s < end;) {
6089 if (*s != '\\')
6090 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006091 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006092 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006093 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 }
6095 if (((s - bs) & 1) == 0 ||
6096 s >= end ||
6097 (*s != 'u' && *s != 'U')) {
6098 continue;
6099 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006100 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 count = *s=='u' ? 4 : 8;
6102 s++;
6103
6104 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 for (x = 0, i = 0; i < count; ++i, ++s) {
6106 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006107 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006109 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 errors, &errorHandler,
6111 "rawunicodeescape", "truncated \\uXXXX",
6112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006113 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 goto onError;
6115 goto nextByte;
6116 }
6117 x = (x<<4) & ~0xF;
6118 if (c >= '0' && c <= '9')
6119 x += c - '0';
6120 else if (c >= 'a' && c <= 'f')
6121 x += 10 + c - 'a';
6122 else
6123 x += 10 + c - 'A';
6124 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006125 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006126 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006127 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006128 }
6129 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006130 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006131 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006132 errors, &errorHandler,
6133 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006135 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 nextByte:
6139 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006143 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006144
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006146 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return NULL;
6150}
6151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006156 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 char *p;
6158 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 Py_ssize_t expandsize, pos;
6160 int kind;
6161 void *data;
6162 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 if (!PyUnicode_Check(unicode)) {
6165 PyErr_BadArgument();
6166 return NULL;
6167 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006168 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 return NULL;
6170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
6172 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006173 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6174 bytes, and 1 byte characters 4. */
6175 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006176
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (repr == NULL)
6182 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006184 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006187 for (pos = 0; pos < len; pos++) {
6188 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 /* Map 32-bit characters to '\Uxxxxxxxx' */
6190 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006191 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006192 *p++ = '\\';
6193 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006194 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6195 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6199 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6201 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006202 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006204 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 *p++ = '\\';
6206 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006207 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6208 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6210 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 /* Copy everything else as-is */
6213 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = (char) ch;
6215 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006216
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217 assert(p > q);
6218 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006219 return NULL;
6220 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Alexander Belopolsky40018472011-02-26 01:02:56 +00006223PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6225 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006227 PyObject *result;
6228 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6229 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006230 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6232 Py_DECREF(tmp);
6233 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234}
6235
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006236/* --- Unicode Internal Codec ------------------------------------------- */
6237
Alexander Belopolsky40018472011-02-26 01:02:56 +00006238PyObject *
6239_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006240 Py_ssize_t size,
6241 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006242{
6243 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006244 Py_ssize_t startinpos;
6245 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006246 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 const char *end;
6248 const char *reason;
6249 PyObject *errorHandler = NULL;
6250 PyObject *exc = NULL;
6251
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006253 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 1))
6255 return NULL;
6256
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006257 if (size == 0)
6258 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006259
Victor Stinner8f674cc2013-04-17 23:02:17 +02006260 _PyUnicodeWriter_Init(&writer);
6261 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6262 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006264 }
6265 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006266
Victor Stinner8f674cc2013-04-17 23:02:17 +02006267 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006269 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006270 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006271 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006272 endinpos = end-starts;
6273 reason = "truncated input";
6274 goto error;
6275 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006276 /* We copy the raw representation one byte at a time because the
6277 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006278 ((char *) &uch)[0] = s[0];
6279 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006280#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006281 ((char *) &uch)[2] = s[2];
6282 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006283#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006284 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006285#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 /* We have to sanity check the raw data, otherwise doom looms for
6287 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006288 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006289 endinpos = s - starts + Py_UNICODE_SIZE;
6290 reason = "illegal code point (> 0x10FFFF)";
6291 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006292 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006293#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294 s += Py_UNICODE_SIZE;
6295#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006296 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006297 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006298 Py_UNICODE uch2;
6299 ((char *) &uch2)[0] = s[0];
6300 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006301 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006302 {
Victor Stinner551ac952011-11-29 22:58:13 +01006303 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305 }
6306 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006307#endif
6308
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006309 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006310 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006311 continue;
6312
6313 error:
6314 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006315 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006316 errors, &errorHandler,
6317 "unicode_internal", reason,
6318 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006319 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006320 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006321 }
6322
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 Py_XDECREF(errorHandler);
6324 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006325 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006328 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 Py_XDECREF(errorHandler);
6330 Py_XDECREF(exc);
6331 return NULL;
6332}
6333
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334/* --- Latin-1 Codec ------------------------------------------------------ */
6335
Alexander Belopolsky40018472011-02-26 01:02:56 +00006336PyObject *
6337PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006338 Py_ssize_t size,
6339 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006342 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006346static void
6347make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006348 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006349 PyObject *unicode,
6350 Py_ssize_t startpos, Py_ssize_t endpos,
6351 const char *reason)
6352{
6353 if (*exceptionObject == NULL) {
6354 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006355 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006356 encoding, unicode, startpos, endpos, reason);
6357 }
6358 else {
6359 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6360 goto onError;
6361 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6362 goto onError;
6363 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6364 goto onError;
6365 return;
6366 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006367 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006368 }
6369}
6370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006372static void
6373raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006374 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006375 PyObject *unicode,
6376 Py_ssize_t startpos, Py_ssize_t endpos,
6377 const char *reason)
6378{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006379 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006380 encoding, unicode, startpos, endpos, reason);
6381 if (*exceptionObject != NULL)
6382 PyCodec_StrictErrors(*exceptionObject);
6383}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384
6385/* error handling callback helper:
6386 build arguments, call the callback and check the arguments,
6387 put the result into newpos and return the replacement string, which
6388 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006389static PyObject *
6390unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006391 PyObject **errorHandler,
6392 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006394 Py_ssize_t startpos, Py_ssize_t endpos,
6395 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006397 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 PyObject *restuple;
6400 PyObject *resunicode;
6401
6402 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 }
6407
Benjamin Petersonbac79492012-01-14 13:34:47 -05006408 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 return NULL;
6410 len = PyUnicode_GET_LENGTH(unicode);
6411
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006412 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006413 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416
6417 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006422 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 Py_DECREF(restuple);
6424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006426 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 &resunicode, newpos)) {
6428 Py_DECREF(restuple);
6429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006431 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6432 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6433 Py_DECREF(restuple);
6434 return NULL;
6435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006437 *newpos = len + *newpos;
6438 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6440 Py_DECREF(restuple);
6441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 Py_INCREF(resunicode);
6444 Py_DECREF(restuple);
6445 return resunicode;
6446}
6447
Alexander Belopolsky40018472011-02-26 01:02:56 +00006448static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006450 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006451 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453 /* input state */
6454 Py_ssize_t pos=0, size;
6455 int kind;
6456 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 /* output object */
6458 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 /* pointer into the output */
6460 char *str;
6461 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006463 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6464 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 PyObject *errorHandler = NULL;
6466 PyObject *exc = NULL;
6467 /* the following variable is used for caching string comparisons
6468 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6469 int known_errorHandler = -1;
6470
Benjamin Petersonbac79492012-01-14 13:34:47 -05006471 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 return NULL;
6473 size = PyUnicode_GET_LENGTH(unicode);
6474 kind = PyUnicode_KIND(unicode);
6475 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 /* allocate enough for a simple encoding without
6477 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006478 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006479 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006480 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006481 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006482 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006483 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 ressize = size;
6485
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 while (pos < size) {
6487 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 /* can we encode this? */
6490 if (c<limit) {
6491 /* no overflow check, because we know that the space is enough */
6492 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006494 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 Py_ssize_t requiredsize;
6497 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 Py_ssize_t collstart = pos;
6501 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 ++collend;
6505 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6506 if (known_errorHandler==-1) {
6507 if ((errors==NULL) || (!strcmp(errors, "strict")))
6508 known_errorHandler = 1;
6509 else if (!strcmp(errors, "replace"))
6510 known_errorHandler = 2;
6511 else if (!strcmp(errors, "ignore"))
6512 known_errorHandler = 3;
6513 else if (!strcmp(errors, "xmlcharrefreplace"))
6514 known_errorHandler = 4;
6515 else
6516 known_errorHandler = 0;
6517 }
6518 switch (known_errorHandler) {
6519 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006520 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 goto onError;
6522 case 2: /* replace */
6523 while (collstart++<collend)
6524 *str++ = '?'; /* fall through */
6525 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 break;
6528 case 4: /* xmlcharrefreplace */
6529 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 /* determine replacement size */
6531 for (i = collstart, repsize = 0; i < collend; ++i) {
6532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6533 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006545 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006546 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006550 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 if (requiredsize > ressize) {
6552 if (requiredsize<2*ressize)
6553 requiredsize = 2*ressize;
6554 if (_PyBytes_Resize(&res, requiredsize))
6555 goto onError;
6556 str = PyBytes_AS_STRING(res) + respos;
6557 ressize = requiredsize;
6558 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 /* generate replacement */
6560 for (i = collstart; i < collend; ++i) {
6561 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006563 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 break;
6565 default:
6566 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006567 encoding, reason, unicode, &exc,
6568 collstart, collend, &newpos);
6569 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006570 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006572 if (PyBytes_Check(repunicode)) {
6573 /* Directly copy bytes result to output. */
6574 repsize = PyBytes_Size(repunicode);
6575 if (repsize > 1) {
6576 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006577 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006578 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6579 Py_DECREF(repunicode);
6580 goto onError;
6581 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006582 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006583 ressize += repsize-1;
6584 }
6585 memcpy(str, PyBytes_AsString(repunicode), repsize);
6586 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006588 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006589 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* need more space? (at least enough for what we
6592 have+the replacement+the rest of the string, so
6593 we won't have to check space for encodable characters) */
6594 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595 repsize = PyUnicode_GET_LENGTH(repunicode);
6596 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 if (requiredsize > ressize) {
6598 if (requiredsize<2*ressize)
6599 requiredsize = 2*ressize;
6600 if (_PyBytes_Resize(&res, requiredsize)) {
6601 Py_DECREF(repunicode);
6602 goto onError;
6603 }
6604 str = PyBytes_AS_STRING(res) + respos;
6605 ressize = requiredsize;
6606 }
6607 /* check if there is anything unencodable in the replacement
6608 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006609 for (i = 0; repsize-->0; ++i, ++str) {
6610 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006612 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 Py_DECREF(repunicode);
6615 goto onError;
6616 }
6617 *str = (char)c;
6618 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006622 }
6623 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006624 /* Resize if we allocated to much */
6625 size = str - PyBytes_AS_STRING(res);
6626 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006627 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006628 if (_PyBytes_Resize(&res, size) < 0)
6629 goto onError;
6630 }
6631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 Py_XDECREF(errorHandler);
6633 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006634 return res;
6635
6636 onError:
6637 Py_XDECREF(res);
6638 Py_XDECREF(errorHandler);
6639 Py_XDECREF(exc);
6640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641}
6642
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644PyObject *
6645PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 Py_ssize_t size,
6647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 PyObject *result;
6650 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6651 if (unicode == NULL)
6652 return NULL;
6653 result = unicode_encode_ucs1(unicode, errors, 256);
6654 Py_DECREF(unicode);
6655 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656}
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
6661 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 PyErr_BadArgument();
6663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665 if (PyUnicode_READY(unicode) == -1)
6666 return NULL;
6667 /* Fast path: if it is a one-byte string, construct
6668 bytes object directly. */
6669 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6670 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6671 PyUnicode_GET_LENGTH(unicode));
6672 /* Non-Latin-1 characters present. Defer to above function to
6673 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675}
6676
6677PyObject*
6678PyUnicode_AsLatin1String(PyObject *unicode)
6679{
6680 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
6683/* --- 7-bit ASCII Codec -------------------------------------------------- */
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685PyObject *
6686PyUnicode_DecodeASCII(const char *s,
6687 Py_ssize_t size,
6688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006691 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006692 int kind;
6693 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t startinpos;
6695 Py_ssize_t endinpos;
6696 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 const char *e;
6698 PyObject *errorHandler = NULL;
6699 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006702 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006705 if (size == 1 && (unsigned char)s[0] < 128)
6706 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006707
Victor Stinner8f674cc2013-04-17 23:02:17 +02006708 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006709 writer.min_length = size;
6710 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006711 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006714 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006715 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 writer.pos = outpos;
6717 if (writer.pos == size)
6718 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006719
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006720 s += writer.pos;
6721 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006723 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006725 PyUnicode_WRITE(kind, data, writer.pos, c);
6726 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 ++s;
6728 }
6729 else {
6730 startinpos = s-starts;
6731 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006732 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 errors, &errorHandler,
6734 "ascii", "ordinal not in range(128)",
6735 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006736 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 kind = writer.kind;
6739 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 Py_XDECREF(errorHandler);
6743 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006744 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006745
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006747 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 Py_XDECREF(errorHandler);
6749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
6751}
6752
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006754PyObject *
6755PyUnicode_EncodeASCII(const Py_UNICODE *p,
6756 Py_ssize_t size,
6757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 PyObject *result;
6760 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6761 if (unicode == NULL)
6762 return NULL;
6763 result = unicode_encode_ucs1(unicode, errors, 128);
6764 Py_DECREF(unicode);
6765 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766}
6767
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006769_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 PyErr_BadArgument();
6773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 if (PyUnicode_READY(unicode) == -1)
6776 return NULL;
6777 /* Fast path: if it is an ASCII-only string, construct bytes object
6778 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006779 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6781 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783}
6784
6785PyObject *
6786PyUnicode_AsASCIIString(PyObject *unicode)
6787{
6788 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Victor Stinner99b95382011-07-04 14:23:54 +02006791#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006792
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006793/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
/* NEED_RETRY is defined on platforms where int is narrower than size_t.
   NOTE(review): presumably consumed by the code-page codecs, which take
   int sizes -- confirm at the use sites. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide a fallback value when the platform headers do not define
   WC_ERR_INVALID_CHARS. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6802
6803static char*
6804code_page_name(UINT code_page, PyObject **obj)
6805{
6806 *obj = NULL;
6807 if (code_page == CP_ACP)
6808 return "mbcs";
6809 if (code_page == CP_UTF7)
6810 return "CP_UTF7";
6811 if (code_page == CP_UTF8)
6812 return "CP_UTF8";
6813
6814 *obj = PyBytes_FromFormat("cp%u", code_page);
6815 if (*obj == NULL)
6816 return NULL;
6817 return PyBytes_AS_STRING(*obj);
6818}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
Victor Stinner3a50e702011-10-18 21:21:00 +02006820static DWORD
6821decode_code_page_flags(UINT code_page)
6822{
6823 if (code_page == CP_UTF7) {
6824 /* The CP_UTF7 decoder only supports flags=0 */
6825 return 0;
6826 }
6827 else
6828 return MB_ERR_INVALID_CHARS;
6829}
6830
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006832 * Decode a byte string from a Windows code page into unicode object in strict
6833 * mode.
6834 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006835 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6836 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006838static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006839decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006840 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006841 const char *in,
6842 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843{
Victor Stinner3a50e702011-10-18 21:21:00 +02006844 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006845 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847
6848 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006849 assert(insize > 0);
6850 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6851 if (outsize <= 0)
6852 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
6854 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006856 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006857 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 if (*v == NULL)
6859 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006860 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861 }
6862 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006865 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 }
6869
6870 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6872 if (outsize <= 0)
6873 goto error;
6874 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006875
Victor Stinner3a50e702011-10-18 21:21:00 +02006876error:
6877 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6878 return -2;
6879 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006880 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881}
6882
Victor Stinner3a50e702011-10-18 21:21:00 +02006883/*
6884 * Decode a byte string from a code page into unicode object with an error
6885 * handler.
6886 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006887 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 * UnicodeDecodeError exception and returns -1 on error.
6889 */
6890static int
6891decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 PyObject **v,
6893 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006894 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006895{
6896 const char *startin = in;
6897 const char *endin = in + size;
6898 const DWORD flags = decode_code_page_flags(code_page);
6899 /* Ideally, we should get reason from FormatMessage. This is the Windows
6900 2000 English version of the message. */
6901 const char *reason = "No mapping for the Unicode character exists "
6902 "in the target code page.";
6903 /* each step cannot decode more than 1 character, but a character can be
6904 represented as a surrogate pair */
6905 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006906 int insize;
6907 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 PyObject *errorHandler = NULL;
6909 PyObject *exc = NULL;
6910 PyObject *encoding_obj = NULL;
6911 char *encoding;
6912 DWORD err;
6913 int ret = -1;
6914
6915 assert(size > 0);
6916
6917 encoding = code_page_name(code_page, &encoding_obj);
6918 if (encoding == NULL)
6919 return -1;
6920
Victor Stinner7d00cc12014-03-17 23:08:06 +01006921 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006922 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6923 UnicodeDecodeError. */
6924 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6925 if (exc != NULL) {
6926 PyCodec_StrictErrors(exc);
6927 Py_CLEAR(exc);
6928 }
6929 goto error;
6930 }
6931
6932 if (*v == NULL) {
6933 /* Create unicode object */
6934 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6935 PyErr_NoMemory();
6936 goto error;
6937 }
Victor Stinnerab595942011-12-17 04:59:06 +01006938 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006939 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 if (*v == NULL)
6941 goto error;
6942 startout = PyUnicode_AS_UNICODE(*v);
6943 }
6944 else {
6945 /* Extend unicode object */
6946 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6947 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6948 PyErr_NoMemory();
6949 goto error;
6950 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006951 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006952 goto error;
6953 startout = PyUnicode_AS_UNICODE(*v) + n;
6954 }
6955
6956 /* Decode the byte string character per character */
6957 out = startout;
6958 while (in < endin)
6959 {
6960 /* Decode a character */
6961 insize = 1;
6962 do
6963 {
6964 outsize = MultiByteToWideChar(code_page, flags,
6965 in, insize,
6966 buffer, Py_ARRAY_LENGTH(buffer));
6967 if (outsize > 0)
6968 break;
6969 err = GetLastError();
6970 if (err != ERROR_NO_UNICODE_TRANSLATION
6971 && err != ERROR_INSUFFICIENT_BUFFER)
6972 {
6973 PyErr_SetFromWindowsErr(0);
6974 goto error;
6975 }
6976 insize++;
6977 }
6978 /* 4=maximum length of a UTF-8 sequence */
6979 while (insize <= 4 && (in + insize) <= endin);
6980
6981 if (outsize <= 0) {
6982 Py_ssize_t startinpos, endinpos, outpos;
6983
Victor Stinner7d00cc12014-03-17 23:08:06 +01006984 /* last character in partial decode? */
6985 if (in + insize >= endin && !final)
6986 break;
6987
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 startinpos = in - startin;
6989 endinpos = startinpos + 1;
6990 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006991 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 errors, &errorHandler,
6993 encoding, reason,
6994 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006995 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 {
6997 goto error;
6998 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006999 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 }
7001 else {
7002 in += insize;
7003 memcpy(out, buffer, outsize * sizeof(wchar_t));
7004 out += outsize;
7005 }
7006 }
7007
7008 /* write a NUL character at the end */
7009 *out = 0;
7010
7011 /* Extend unicode object */
7012 outsize = out - startout;
7013 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007014 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 goto error;
Victor Stinner7d00cc12014-03-17 23:08:06 +01007016 ret = in - startin;
Victor Stinner3a50e702011-10-18 21:21:00 +02007017
7018error:
7019 Py_XDECREF(encoding_obj);
7020 Py_XDECREF(errorHandler);
7021 Py_XDECREF(exc);
7022 return ret;
7023}
7024
Victor Stinner3a50e702011-10-18 21:21:00 +02007025static PyObject *
7026decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007027 const char *s, Py_ssize_t size,
7028 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029{
Victor Stinner76a31a62011-11-04 00:05:13 +01007030 PyObject *v = NULL;
7031 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
Victor Stinner3a50e702011-10-18 21:21:00 +02007033 if (code_page < 0) {
7034 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7035 return NULL;
7036 }
7037
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040
Victor Stinner76a31a62011-11-04 00:05:13 +01007041 do
7042 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007043#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007044 if (size > INT_MAX) {
7045 chunk_size = INT_MAX;
7046 final = 0;
7047 done = 0;
7048 }
7049 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007051 {
7052 chunk_size = (int)size;
7053 final = (consumed == NULL);
7054 done = 1;
7055 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056
Victor Stinner76a31a62011-11-04 00:05:13 +01007057 if (chunk_size == 0 && done) {
7058 if (v != NULL)
7059 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007061 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062
Victor Stinner76a31a62011-11-04 00:05:13 +01007063 converted = decode_code_page_strict(code_page, &v,
7064 s, chunk_size);
7065 if (converted == -2)
7066 converted = decode_code_page_errors(code_page, &v,
7067 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007068 errors, final);
7069 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007070
7071 if (converted < 0) {
7072 Py_XDECREF(v);
7073 return NULL;
7074 }
7075
7076 if (consumed)
7077 *consumed += converted;
7078
7079 s += converted;
7080 size -= converted;
7081 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007083 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084}
7085
Alexander Belopolsky40018472011-02-26 01:02:56 +00007086PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007087PyUnicode_DecodeCodePageStateful(int code_page,
7088 const char *s,
7089 Py_ssize_t size,
7090 const char *errors,
7091 Py_ssize_t *consumed)
7092{
7093 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7094}
7095
7096PyObject *
7097PyUnicode_DecodeMBCSStateful(const char *s,
7098 Py_ssize_t size,
7099 const char *errors,
7100 Py_ssize_t *consumed)
7101{
7102 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7103}
7104
7105PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007106PyUnicode_DecodeMBCS(const char *s,
7107 Py_ssize_t size,
7108 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007109{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7111}
7112
Victor Stinner3a50e702011-10-18 21:21:00 +02007113static DWORD
7114encode_code_page_flags(UINT code_page, const char *errors)
7115{
7116 if (code_page == CP_UTF8) {
7117 if (winver.dwMajorVersion >= 6)
7118 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7119 and later */
7120 return WC_ERR_INVALID_CHARS;
7121 else
7122 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7123 return 0;
7124 }
7125 else if (code_page == CP_UTF7) {
7126 /* CP_UTF7 only supports flags=0 */
7127 return 0;
7128 }
7129 else {
7130 if (errors != NULL && strcmp(errors, "replace") == 0)
7131 return 0;
7132 else
7133 return WC_NO_BEST_FIT_CHARS;
7134 }
7135}
7136
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 * Encode a Unicode string to a Windows code page into a byte string in strict
7139 * mode.
7140 *
7141 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007142 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007144static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007145encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007146 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148{
Victor Stinner554f3f02010-06-16 23:33:54 +00007149 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 BOOL *pusedDefaultChar = &usedDefaultChar;
7151 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007152 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007153 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007154 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 const DWORD flags = encode_code_page_flags(code_page, NULL);
7156 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007157 /* Create a substring so that we can get the UTF-16 representation
7158 of just the slice under consideration. */
7159 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160
Martin v. Löwis3d325192011-11-04 18:23:06 +01007161 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007162
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007164 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007166 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007167
Victor Stinner2fc507f2011-11-04 20:06:39 +01007168 substring = PyUnicode_Substring(unicode, offset, offset+len);
7169 if (substring == NULL)
7170 return -1;
7171 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7172 if (p == NULL) {
7173 Py_DECREF(substring);
7174 return -1;
7175 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007176 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007177
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007178 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007180 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 NULL, 0,
7182 NULL, pusedDefaultChar);
7183 if (outsize <= 0)
7184 goto error;
7185 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007186 if (pusedDefaultChar && *pusedDefaultChar) {
7187 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007189 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007190
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194 if (*outbytes == NULL) {
7195 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007197 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199 }
7200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 const Py_ssize_t n = PyBytes_Size(*outbytes);
7203 if (outsize > PY_SSIZE_T_MAX - n) {
7204 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007208 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7209 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007211 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213 }
7214
7215 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007217 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 out, outsize,
7219 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 if (outsize <= 0)
7222 goto error;
7223 if (pusedDefaultChar && *pusedDefaultChar)
7224 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007225 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007226
Victor Stinner3a50e702011-10-18 21:21:00 +02007227error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007228 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7230 return -2;
7231 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007232 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007233}
7234
Victor Stinner3a50e702011-10-18 21:21:00 +02007235/*
7236 * Encode a Unicode string to a Windows code page into a byte string using a
7237 * error handler.
7238 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007239 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 * -1 on other error.
7241 */
7242static int
7243encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007244 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007245 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007246{
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007248 Py_ssize_t pos = unicode_offset;
7249 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 /* Ideally, we should get reason from FormatMessage. This is the Windows
7251 2000 English version of the message. */
7252 const char *reason = "invalid character";
7253 /* 4=maximum length of a UTF-8 sequence */
7254 char buffer[4];
7255 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7256 Py_ssize_t outsize;
7257 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 PyObject *errorHandler = NULL;
7259 PyObject *exc = NULL;
7260 PyObject *encoding_obj = NULL;
7261 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007262 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 PyObject *rep;
7264 int ret = -1;
7265
7266 assert(insize > 0);
7267
7268 encoding = code_page_name(code_page, &encoding_obj);
7269 if (encoding == NULL)
7270 return -1;
7271
7272 if (errors == NULL || strcmp(errors, "strict") == 0) {
7273 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7274 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007275 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 if (exc != NULL) {
7277 PyCodec_StrictErrors(exc);
7278 Py_DECREF(exc);
7279 }
7280 Py_XDECREF(encoding_obj);
7281 return -1;
7282 }
7283
7284 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7285 pusedDefaultChar = &usedDefaultChar;
7286 else
7287 pusedDefaultChar = NULL;
7288
7289 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7290 PyErr_NoMemory();
7291 goto error;
7292 }
7293 outsize = insize * Py_ARRAY_LENGTH(buffer);
7294
7295 if (*outbytes == NULL) {
7296 /* Create string object */
7297 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7298 if (*outbytes == NULL)
7299 goto error;
7300 out = PyBytes_AS_STRING(*outbytes);
7301 }
7302 else {
7303 /* Extend string object */
7304 Py_ssize_t n = PyBytes_Size(*outbytes);
7305 if (n > PY_SSIZE_T_MAX - outsize) {
7306 PyErr_NoMemory();
7307 goto error;
7308 }
7309 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7310 goto error;
7311 out = PyBytes_AS_STRING(*outbytes) + n;
7312 }
7313
7314 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007315 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007317 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7318 wchar_t chars[2];
7319 int charsize;
7320 if (ch < 0x10000) {
7321 chars[0] = (wchar_t)ch;
7322 charsize = 1;
7323 }
7324 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007325 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7326 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007327 charsize = 2;
7328 }
7329
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007331 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 buffer, Py_ARRAY_LENGTH(buffer),
7333 NULL, pusedDefaultChar);
7334 if (outsize > 0) {
7335 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7336 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007337 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 memcpy(out, buffer, outsize);
7339 out += outsize;
7340 continue;
7341 }
7342 }
7343 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7344 PyErr_SetFromWindowsErr(0);
7345 goto error;
7346 }
7347
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 rep = unicode_encode_call_errorhandler(
7349 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007350 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007351 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 if (rep == NULL)
7353 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007355
7356 if (PyBytes_Check(rep)) {
7357 outsize = PyBytes_GET_SIZE(rep);
7358 if (outsize != 1) {
7359 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7360 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7361 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7362 Py_DECREF(rep);
7363 goto error;
7364 }
7365 out = PyBytes_AS_STRING(*outbytes) + offset;
7366 }
7367 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7368 out += outsize;
7369 }
7370 else {
7371 Py_ssize_t i;
7372 enum PyUnicode_Kind kind;
7373 void *data;
7374
Benjamin Petersonbac79492012-01-14 13:34:47 -05007375 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 Py_DECREF(rep);
7377 goto error;
7378 }
7379
7380 outsize = PyUnicode_GET_LENGTH(rep);
7381 if (outsize != 1) {
7382 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7383 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7384 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7385 Py_DECREF(rep);
7386 goto error;
7387 }
7388 out = PyBytes_AS_STRING(*outbytes) + offset;
7389 }
7390 kind = PyUnicode_KIND(rep);
7391 data = PyUnicode_DATA(rep);
7392 for (i=0; i < outsize; i++) {
7393 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7394 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007395 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007396 encoding, unicode,
7397 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 "unable to encode error handler result to ASCII");
7399 Py_DECREF(rep);
7400 goto error;
7401 }
7402 *out = (unsigned char)ch;
7403 out++;
7404 }
7405 }
7406 Py_DECREF(rep);
7407 }
7408 /* write a NUL byte */
7409 *out = 0;
7410 outsize = out - PyBytes_AS_STRING(*outbytes);
7411 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7412 if (_PyBytes_Resize(outbytes, outsize) < 0)
7413 goto error;
7414 ret = 0;
7415
7416error:
7417 Py_XDECREF(encoding_obj);
7418 Py_XDECREF(errorHandler);
7419 Py_XDECREF(exc);
7420 return ret;
7421}
7422
Victor Stinner3a50e702011-10-18 21:21:00 +02007423static PyObject *
7424encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007425 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 const char *errors)
7427{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007428 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007430 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007431 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007432
Benjamin Petersonbac79492012-01-14 13:34:47 -05007433 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007434 return NULL;
7435 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007436
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 if (code_page < 0) {
7438 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7439 return NULL;
7440 }
7441
Martin v. Löwis3d325192011-11-04 18:23:06 +01007442 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 return PyBytes_FromStringAndSize(NULL, 0);
7444
Victor Stinner7581cef2011-11-03 22:32:33 +01007445 offset = 0;
7446 do
7447 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007449 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450 chunks. */
7451 if (len > INT_MAX/2) {
7452 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007453 done = 0;
7454 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007455 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007457 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007458 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007459 done = 1;
7460 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007463 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007464 errors);
7465 if (ret == -2)
7466 ret = encode_code_page_errors(code_page, &outbytes,
7467 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007468 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007469 if (ret < 0) {
7470 Py_XDECREF(outbytes);
7471 return NULL;
7472 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473
Victor Stinner7581cef2011-11-03 22:32:33 +01007474 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 return outbytes;
7479}
7480
7481PyObject *
7482PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7483 Py_ssize_t size,
7484 const char *errors)
7485{
Victor Stinner7581cef2011-11-03 22:32:33 +01007486 PyObject *unicode, *res;
7487 unicode = PyUnicode_FromUnicode(p, size);
7488 if (unicode == NULL)
7489 return NULL;
7490 res = encode_code_page(CP_ACP, unicode, errors);
7491 Py_DECREF(unicode);
7492 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007493}
7494
7495PyObject *
7496PyUnicode_EncodeCodePage(int code_page,
7497 PyObject *unicode,
7498 const char *errors)
7499{
Victor Stinner7581cef2011-11-03 22:32:33 +01007500 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007501}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007502
Alexander Belopolsky40018472011-02-26 01:02:56 +00007503PyObject *
7504PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007505{
7506 if (!PyUnicode_Check(unicode)) {
7507 PyErr_BadArgument();
7508 return NULL;
7509 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007510 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007511}
7512
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007513#undef NEED_RETRY
7514
Victor Stinner99b95382011-07-04 14:23:54 +02007515#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007516
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517/* --- Character Mapping Codec -------------------------------------------- */
7518
Victor Stinnerfb161b12013-04-18 01:44:27 +02007519static int
7520charmap_decode_string(const char *s,
7521 Py_ssize_t size,
7522 PyObject *mapping,
7523 const char *errors,
7524 _PyUnicodeWriter *writer)
7525{
7526 const char *starts = s;
7527 const char *e;
7528 Py_ssize_t startinpos, endinpos;
7529 PyObject *errorHandler = NULL, *exc = NULL;
7530 Py_ssize_t maplen;
7531 enum PyUnicode_Kind mapkind;
7532 void *mapdata;
7533 Py_UCS4 x;
7534 unsigned char ch;
7535
7536 if (PyUnicode_READY(mapping) == -1)
7537 return -1;
7538
7539 maplen = PyUnicode_GET_LENGTH(mapping);
7540 mapdata = PyUnicode_DATA(mapping);
7541 mapkind = PyUnicode_KIND(mapping);
7542
7543 e = s + size;
7544
7545 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7546 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7547 * is disabled in encoding aliases, latin1 is preferred because
7548 * its implementation is faster. */
7549 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7550 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7551 Py_UCS4 maxchar = writer->maxchar;
7552
7553 assert (writer->kind == PyUnicode_1BYTE_KIND);
7554 while (s < e) {
7555 ch = *s;
7556 x = mapdata_ucs1[ch];
7557 if (x > maxchar) {
7558 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7559 goto onError;
7560 maxchar = writer->maxchar;
7561 outdata = (Py_UCS1 *)writer->data;
7562 }
7563 outdata[writer->pos] = x;
7564 writer->pos++;
7565 ++s;
7566 }
7567 return 0;
7568 }
7569
7570 while (s < e) {
7571 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7572 enum PyUnicode_Kind outkind = writer->kind;
7573 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7574 if (outkind == PyUnicode_1BYTE_KIND) {
7575 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7576 Py_UCS4 maxchar = writer->maxchar;
7577 while (s < e) {
7578 ch = *s;
7579 x = mapdata_ucs2[ch];
7580 if (x > maxchar)
7581 goto Error;
7582 outdata[writer->pos] = x;
7583 writer->pos++;
7584 ++s;
7585 }
7586 break;
7587 }
7588 else if (outkind == PyUnicode_2BYTE_KIND) {
7589 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7590 while (s < e) {
7591 ch = *s;
7592 x = mapdata_ucs2[ch];
7593 if (x == 0xFFFE)
7594 goto Error;
7595 outdata[writer->pos] = x;
7596 writer->pos++;
7597 ++s;
7598 }
7599 break;
7600 }
7601 }
7602 ch = *s;
7603
7604 if (ch < maplen)
7605 x = PyUnicode_READ(mapkind, mapdata, ch);
7606 else
7607 x = 0xfffe; /* invalid value */
7608Error:
7609 if (x == 0xfffe)
7610 {
7611 /* undefined mapping */
7612 startinpos = s-starts;
7613 endinpos = startinpos+1;
7614 if (unicode_decode_call_errorhandler_writer(
7615 errors, &errorHandler,
7616 "charmap", "character maps to <undefined>",
7617 &starts, &e, &startinpos, &endinpos, &exc, &s,
7618 writer)) {
7619 goto onError;
7620 }
7621 continue;
7622 }
7623
7624 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7625 goto onError;
7626 ++s;
7627 }
7628 Py_XDECREF(errorHandler);
7629 Py_XDECREF(exc);
7630 return 0;
7631
7632onError:
7633 Py_XDECREF(errorHandler);
7634 Py_XDECREF(exc);
7635 return -1;
7636}
7637
7638static int
7639charmap_decode_mapping(const char *s,
7640 Py_ssize_t size,
7641 PyObject *mapping,
7642 const char *errors,
7643 _PyUnicodeWriter *writer)
7644{
7645 const char *starts = s;
7646 const char *e;
7647 Py_ssize_t startinpos, endinpos;
7648 PyObject *errorHandler = NULL, *exc = NULL;
7649 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007650 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007651
7652 e = s + size;
7653
7654 while (s < e) {
7655 ch = *s;
7656
7657 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7658 key = PyLong_FromLong((long)ch);
7659 if (key == NULL)
7660 goto onError;
7661
7662 item = PyObject_GetItem(mapping, key);
7663 Py_DECREF(key);
7664 if (item == NULL) {
7665 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7666 /* No mapping found means: mapping is undefined. */
7667 PyErr_Clear();
7668 goto Undefined;
7669 } else
7670 goto onError;
7671 }
7672
7673 /* Apply mapping */
7674 if (item == Py_None)
7675 goto Undefined;
7676 if (PyLong_Check(item)) {
7677 long value = PyLong_AS_LONG(item);
7678 if (value == 0xFFFE)
7679 goto Undefined;
7680 if (value < 0 || value > MAX_UNICODE) {
7681 PyErr_Format(PyExc_TypeError,
7682 "character mapping must be in range(0x%lx)",
7683 (unsigned long)MAX_UNICODE + 1);
7684 goto onError;
7685 }
7686
7687 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7688 goto onError;
7689 }
7690 else if (PyUnicode_Check(item)) {
7691 if (PyUnicode_READY(item) == -1)
7692 goto onError;
7693 if (PyUnicode_GET_LENGTH(item) == 1) {
7694 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7695 if (value == 0xFFFE)
7696 goto Undefined;
7697 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7698 goto onError;
7699 }
7700 else {
7701 writer->overallocate = 1;
7702 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7703 goto onError;
7704 }
7705 }
7706 else {
7707 /* wrong return value */
7708 PyErr_SetString(PyExc_TypeError,
7709 "character mapping must return integer, None or str");
7710 goto onError;
7711 }
7712 Py_CLEAR(item);
7713 ++s;
7714 continue;
7715
7716Undefined:
7717 /* undefined mapping */
7718 Py_CLEAR(item);
7719 startinpos = s-starts;
7720 endinpos = startinpos+1;
7721 if (unicode_decode_call_errorhandler_writer(
7722 errors, &errorHandler,
7723 "charmap", "character maps to <undefined>",
7724 &starts, &e, &startinpos, &endinpos, &exc, &s,
7725 writer)) {
7726 goto onError;
7727 }
7728 }
7729 Py_XDECREF(errorHandler);
7730 Py_XDECREF(exc);
7731 return 0;
7732
7733onError:
7734 Py_XDECREF(item);
7735 Py_XDECREF(errorHandler);
7736 Py_XDECREF(exc);
7737 return -1;
7738}
7739
Alexander Belopolsky40018472011-02-26 01:02:56 +00007740PyObject *
7741PyUnicode_DecodeCharmap(const char *s,
7742 Py_ssize_t size,
7743 PyObject *mapping,
7744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007746 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007747
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 /* Default to Latin-1 */
7749 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007753 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007754 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007755 writer.min_length = size;
7756 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007758
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007759 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007760 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7761 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007762 }
7763 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007764 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7765 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007767 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007768
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007770 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 return NULL;
7772}
7773
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774/* Charmap encoding: the lookup table */
7775
Alexander Belopolsky40018472011-02-26 01:02:56 +00007776struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 PyObject_HEAD
7778 unsigned char level1[32];
7779 int count2, count3;
7780 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007781};
7782
7783static PyObject*
7784encoding_map_size(PyObject *obj, PyObject* args)
7785{
7786 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007789}
7790
7791static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 PyDoc_STR("Return the size (in bytes) of this object") },
7794 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007795};
7796
7797static void
7798encoding_map_dealloc(PyObject* o)
7799{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007800 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007801}
7802
7803static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007804 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 "EncodingMap", /*tp_name*/
7806 sizeof(struct encoding_map), /*tp_basicsize*/
7807 0, /*tp_itemsize*/
7808 /* methods */
7809 encoding_map_dealloc, /*tp_dealloc*/
7810 0, /*tp_print*/
7811 0, /*tp_getattr*/
7812 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007813 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 0, /*tp_repr*/
7815 0, /*tp_as_number*/
7816 0, /*tp_as_sequence*/
7817 0, /*tp_as_mapping*/
7818 0, /*tp_hash*/
7819 0, /*tp_call*/
7820 0, /*tp_str*/
7821 0, /*tp_getattro*/
7822 0, /*tp_setattro*/
7823 0, /*tp_as_buffer*/
7824 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7825 0, /*tp_doc*/
7826 0, /*tp_traverse*/
7827 0, /*tp_clear*/
7828 0, /*tp_richcompare*/
7829 0, /*tp_weaklistoffset*/
7830 0, /*tp_iter*/
7831 0, /*tp_iternext*/
7832 encoding_map_methods, /*tp_methods*/
7833 0, /*tp_members*/
7834 0, /*tp_getset*/
7835 0, /*tp_base*/
7836 0, /*tp_dict*/
7837 0, /*tp_descr_get*/
7838 0, /*tp_descr_set*/
7839 0, /*tp_dictoffset*/
7840 0, /*tp_init*/
7841 0, /*tp_alloc*/
7842 0, /*tp_new*/
7843 0, /*tp_free*/
7844 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845};
7846
7847PyObject*
7848PyUnicode_BuildEncodingMap(PyObject* string)
7849{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 PyObject *result;
7851 struct encoding_map *mresult;
7852 int i;
7853 int need_dict = 0;
7854 unsigned char level1[32];
7855 unsigned char level2[512];
7856 unsigned char *mlevel1, *mlevel2, *mlevel3;
7857 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007858 int kind;
7859 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007860 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007863 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007864 PyErr_BadArgument();
7865 return NULL;
7866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007867 kind = PyUnicode_KIND(string);
7868 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007869 length = PyUnicode_GET_LENGTH(string);
7870 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871 memset(level1, 0xFF, sizeof level1);
7872 memset(level2, 0xFF, sizeof level2);
7873
7874 /* If there isn't a one-to-one mapping of NULL to \0,
7875 or if there are non-BMP characters, we need to use
7876 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007879 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 ch = PyUnicode_READ(kind, data, i);
7882 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 need_dict = 1;
7884 break;
7885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007886 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 /* unmapped character */
7888 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 l1 = ch >> 11;
7890 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 if (level1[l1] == 0xFF)
7892 level1[l1] = count2++;
7893 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007894 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895 }
7896
7897 if (count2 >= 0xFF || count3 >= 0xFF)
7898 need_dict = 1;
7899
7900 if (need_dict) {
7901 PyObject *result = PyDict_New();
7902 PyObject *key, *value;
7903 if (!result)
7904 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007905 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007906 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007907 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007908 if (!key || !value)
7909 goto failed1;
7910 if (PyDict_SetItem(result, key, value) == -1)
7911 goto failed1;
7912 Py_DECREF(key);
7913 Py_DECREF(value);
7914 }
7915 return result;
7916 failed1:
7917 Py_XDECREF(key);
7918 Py_XDECREF(value);
7919 Py_DECREF(result);
7920 return NULL;
7921 }
7922
7923 /* Create a three-level trie */
7924 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7925 16*count2 + 128*count3 - 1);
7926 if (!result)
7927 return PyErr_NoMemory();
7928 PyObject_Init(result, &EncodingMapType);
7929 mresult = (struct encoding_map*)result;
7930 mresult->count2 = count2;
7931 mresult->count3 = count3;
7932 mlevel1 = mresult->level1;
7933 mlevel2 = mresult->level23;
7934 mlevel3 = mresult->level23 + 16*count2;
7935 memcpy(mlevel1, level1, 32);
7936 memset(mlevel2, 0xFF, 16*count2);
7937 memset(mlevel3, 0, 128*count3);
7938 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007939 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007940 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007941 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7942 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943 /* unmapped character */
7944 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007945 o1 = ch>>11;
7946 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 i2 = 16*mlevel1[o1] + o2;
7948 if (mlevel2[i2] == 0xFF)
7949 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007950 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007951 i3 = 128*mlevel2[i2] + o3;
7952 mlevel3[i3] = i;
7953 }
7954 return result;
7955}
7956
7957static int
Victor Stinner22168992011-11-20 17:09:18 +01007958encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959{
7960 struct encoding_map *map = (struct encoding_map*)mapping;
7961 int l1 = c>>11;
7962 int l2 = (c>>7) & 0xF;
7963 int l3 = c & 0x7F;
7964 int i;
7965
Victor Stinner22168992011-11-20 17:09:18 +01007966 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 if (c == 0)
7969 return 0;
7970 /* level 1*/
7971 i = map->level1[l1];
7972 if (i == 0xFF) {
7973 return -1;
7974 }
7975 /* level 2*/
7976 i = map->level23[16*i+l2];
7977 if (i == 0xFF) {
7978 return -1;
7979 }
7980 /* level 3 */
7981 i = map->level23[16*map->count2 + 128*i + l3];
7982 if (i == 0) {
7983 return -1;
7984 }
7985 return i;
7986}
7987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988/* Lookup the character ch in the mapping. If the character
7989 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007990 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007991static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007992charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993{
Christian Heimes217cfd12007-12-02 14:31:20 +00007994 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007995 PyObject *x;
7996
7997 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999 x = PyObject_GetItem(mapping, w);
8000 Py_DECREF(w);
8001 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8003 /* No mapping found means: mapping is undefined. */
8004 PyErr_Clear();
8005 x = Py_None;
8006 Py_INCREF(x);
8007 return x;
8008 } else
8009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008011 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008013 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 long value = PyLong_AS_LONG(x);
8015 if (value < 0 || value > 255) {
8016 PyErr_SetString(PyExc_TypeError,
8017 "character mapping must be in range(256)");
8018 Py_DECREF(x);
8019 return NULL;
8020 }
8021 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008023 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 /* wrong return value */
8027 PyErr_Format(PyExc_TypeError,
8028 "character mapping must return integer, bytes or None, not %.400s",
8029 x->ob_type->tp_name);
8030 Py_DECREF(x);
8031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 }
8033}
8034
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008036charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8039 /* exponentially overallocate to minimize reallocations */
8040 if (requiredsize < 2*outsize)
8041 requiredsize = 2*outsize;
8042 if (_PyBytes_Resize(outobj, requiredsize))
8043 return -1;
8044 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045}
8046
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the replacement byte(s) were written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* the lookup or the buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008050/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008051 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052 space is available. Return a new reference to the object that
8053 was put in the output buffer, or Py_None, if the mapping was undefined
8054 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008055 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008056static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008057charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008058 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008059{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060 PyObject *rep;
8061 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008062 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063
Christian Heimes90aa7642007-12-19 02:45:37 +00008064 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 if (res == -1)
8068 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 if (outsize<requiredsize)
8070 if (charmapencode_resize(outobj, outpos, requiredsize))
8071 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008072 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 outstart[(*outpos)++] = (char)res;
8074 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075 }
8076
8077 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 Py_DECREF(rep);
8082 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 if (PyLong_Check(rep)) {
8085 Py_ssize_t requiredsize = *outpos+1;
8086 if (outsize<requiredsize)
8087 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8088 Py_DECREF(rep);
8089 return enc_EXCEPTION;
8090 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008091 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 else {
8095 const char *repchars = PyBytes_AS_STRING(rep);
8096 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8097 Py_ssize_t requiredsize = *outpos+repsize;
8098 if (outsize<requiredsize)
8099 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8100 Py_DECREF(rep);
8101 return enc_EXCEPTION;
8102 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008103 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 memcpy(outstart + *outpos, repchars, repsize);
8105 *outpos += repsize;
8106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008107 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 Py_DECREF(rep);
8109 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110}
8111
8112/* handle an error in PyUnicode_EncodeCharmap
8113 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008114static int
8115charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008116 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008118 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008119 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120{
8121 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008122 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008123 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008124 enum PyUnicode_Kind kind;
8125 void *data;
8126 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 Py_ssize_t collstartpos = *inpos;
8129 Py_ssize_t collendpos = *inpos+1;
8130 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 char *encoding = "charmap";
8132 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008135 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136
Benjamin Petersonbac79492012-01-14 13:34:47 -05008137 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008138 return -1;
8139 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 /* find all unencodable characters */
8141 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008143 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008144 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008145 val = encoding_map_lookup(ch, mapping);
8146 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 break;
8148 ++collendpos;
8149 continue;
8150 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008151
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008152 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8153 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 if (rep==NULL)
8155 return -1;
8156 else if (rep!=Py_None) {
8157 Py_DECREF(rep);
8158 break;
8159 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 }
8163 /* cache callback name lookup
8164 * (if not done yet, i.e. it's the first error) */
8165 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 if ((errors==NULL) || (!strcmp(errors, "strict")))
8167 *known_errorHandler = 1;
8168 else if (!strcmp(errors, "replace"))
8169 *known_errorHandler = 2;
8170 else if (!strcmp(errors, "ignore"))
8171 *known_errorHandler = 3;
8172 else if (!strcmp(errors, "xmlcharrefreplace"))
8173 *known_errorHandler = 4;
8174 else
8175 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008176 }
8177 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008179 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 return -1;
8181 case 2: /* replace */
8182 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 x = charmapencode_output('?', mapping, res, respos);
8184 if (x==enc_EXCEPTION) {
8185 return -1;
8186 }
8187 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008188 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 return -1;
8190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 }
8192 /* fall through */
8193 case 3: /* ignore */
8194 *inpos = collendpos;
8195 break;
8196 case 4: /* xmlcharrefreplace */
8197 /* generate replacement (temporarily (mis)uses p) */
8198 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 char buffer[2+29+1+1];
8200 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008201 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 for (cp = buffer; *cp; ++cp) {
8203 x = charmapencode_output(*cp, mapping, res, respos);
8204 if (x==enc_EXCEPTION)
8205 return -1;
8206 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008207 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return -1;
8209 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 }
8211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 *inpos = collendpos;
8213 break;
8214 default:
8215 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008216 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008220 if (PyBytes_Check(repunicode)) {
8221 /* Directly copy bytes result to output. */
8222 Py_ssize_t outsize = PyBytes_Size(*res);
8223 Py_ssize_t requiredsize;
8224 repsize = PyBytes_Size(repunicode);
8225 requiredsize = *respos + repsize;
8226 if (requiredsize > outsize)
8227 /* Make room for all additional bytes. */
8228 if (charmapencode_resize(res, respos, requiredsize)) {
8229 Py_DECREF(repunicode);
8230 return -1;
8231 }
8232 memcpy(PyBytes_AsString(*res) + *respos,
8233 PyBytes_AsString(repunicode), repsize);
8234 *respos += repsize;
8235 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008236 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008237 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008238 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008239 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008240 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008241 Py_DECREF(repunicode);
8242 return -1;
8243 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008244 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008245 data = PyUnicode_DATA(repunicode);
8246 kind = PyUnicode_KIND(repunicode);
8247 for (index = 0; index < repsize; index++) {
8248 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8249 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008251 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 return -1;
8253 }
8254 else if (x==enc_FAILED) {
8255 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008256 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 return -1;
8258 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008259 }
8260 *inpos = newpos;
8261 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 }
8263 return 0;
8264}
8265
Alexander Belopolsky40018472011-02-26 01:02:56 +00008266PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008267_PyUnicode_EncodeCharmap(PyObject *unicode,
8268 PyObject *mapping,
8269 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 /* output object */
8272 PyObject *res = NULL;
8273 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008274 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008275 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008277 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 PyObject *errorHandler = NULL;
8279 PyObject *exc = NULL;
8280 /* the following variable is used for caching string comparisons
8281 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8282 * 3=ignore, 4=xmlcharrefreplace */
8283 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008284 void *data;
8285 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286
Benjamin Petersonbac79492012-01-14 13:34:47 -05008287 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008288 return NULL;
8289 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008290 data = PyUnicode_DATA(unicode);
8291 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008292
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 /* Default to Latin-1 */
8294 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008295 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 /* allocate enough for a simple encoding without
8298 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008299 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 if (res == NULL)
8301 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008302 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008306 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 if (x==enc_EXCEPTION) /* error */
8310 goto onError;
8311 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008312 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 &exc,
8314 &known_errorHandler, &errorHandler, errors,
8315 &res, &respos)) {
8316 goto onError;
8317 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008318 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 else
8320 /* done with this character => adjust input position */
8321 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008325 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008326 if (_PyBytes_Resize(&res, respos) < 0)
8327 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 Py_XDECREF(exc);
8330 Py_XDECREF(errorHandler);
8331 return res;
8332
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334 Py_XDECREF(res);
8335 Py_XDECREF(exc);
8336 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 return NULL;
8338}
8339
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008340/* Deprecated */
8341PyObject *
8342PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8343 Py_ssize_t size,
8344 PyObject *mapping,
8345 const char *errors)
8346{
8347 PyObject *result;
8348 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8349 if (unicode == NULL)
8350 return NULL;
8351 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8352 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008353 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008354}
8355
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356PyObject *
8357PyUnicode_AsCharmapString(PyObject *unicode,
8358 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359{
8360 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 PyErr_BadArgument();
8362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008364 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365}
8366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368static void
8369make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008371 Py_ssize_t startpos, Py_ssize_t endpos,
8372 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 *exceptionObject = _PyUnicodeTranslateError_Create(
8376 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 }
8378 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8380 goto onError;
8381 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8382 goto onError;
8383 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8384 goto onError;
8385 return;
8386 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008387 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 }
8389}
8390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391/* error handling callback helper:
8392 build arguments, call the callback and check the arguments,
8393 put the result into newpos and return the replacement string, which
8394 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395static PyObject *
8396unicode_translate_call_errorhandler(const char *errors,
8397 PyObject **errorHandler,
8398 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008400 Py_ssize_t startpos, Py_ssize_t endpos,
8401 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008403 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008405 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 PyObject *restuple;
8407 PyObject *resunicode;
8408
8409 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 }
8414
8415 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419
8420 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008425 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 Py_DECREF(restuple);
8427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 }
8429 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 &resunicode, &i_newpos)) {
8431 Py_DECREF(restuple);
8432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008434 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 else
8437 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8440 Py_DECREF(restuple);
8441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 Py_INCREF(resunicode);
8444 Py_DECREF(restuple);
8445 return resunicode;
8446}
8447
8448/* Lookup the character ch in the mapping and put the result in result,
8449 which must be decrefed by the caller.
8450 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008451static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453{
Christian Heimes217cfd12007-12-02 14:31:20 +00008454 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455 PyObject *x;
8456
8457 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 x = PyObject_GetItem(mapping, w);
8460 Py_DECREF(w);
8461 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8463 /* No mapping found means: use 1:1 mapping. */
8464 PyErr_Clear();
8465 *result = NULL;
8466 return 0;
8467 } else
8468 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 }
8470 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 *result = x;
8472 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008474 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 long value = PyLong_AS_LONG(x);
8476 long max = PyUnicode_GetMax();
8477 if (value < 0 || value > max) {
8478 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008479 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 Py_DECREF(x);
8481 return -1;
8482 }
8483 *result = x;
8484 return 0;
8485 }
8486 else if (PyUnicode_Check(x)) {
8487 *result = x;
8488 return 0;
8489 }
8490 else {
8491 /* wrong return value */
8492 PyErr_SetString(PyExc_TypeError,
8493 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 Py_DECREF(x);
8495 return -1;
8496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497}
Victor Stinner1194ea02014-04-04 19:37:40 +02008498
8499/* lookup the character, write the result into the writer.
8500 Return 1 if the result was written into the writer, return 0 if the mapping
8501 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008503charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8504 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505{
Victor Stinner1194ea02014-04-04 19:37:40 +02008506 PyObject *item;
8507
8508 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008510
8511 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008513 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008516 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008518
8519 if (item == Py_None) {
8520 Py_DECREF(item);
8521 return 0;
8522 }
8523
8524 if (PyLong_Check(item)) {
8525 Py_UCS4 ch = (Py_UCS4)PyLong_AS_LONG(item);
8526 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8527 Py_DECREF(item);
8528 return -1;
8529 }
8530 Py_DECREF(item);
8531 return 1;
8532 }
8533
8534 if (!PyUnicode_Check(item)) {
8535 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008537 }
8538
8539 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8540 Py_DECREF(item);
8541 return -1;
8542 }
8543
8544 Py_DECREF(item);
8545 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546}
8547
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549_PyUnicode_TranslateCharmap(PyObject *input,
8550 PyObject *mapping,
8551 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008554 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555 Py_ssize_t size, i;
8556 int kind;
8557 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008558 _PyUnicodeWriter writer;
8559 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 char *reason = "character maps to <undefined>";
8561 PyObject *errorHandler = NULL;
8562 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008563 int ignore;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 PyErr_BadArgument();
8567 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 if (PyUnicode_READY(input) == -1)
8571 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008572 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 kind = PyUnicode_KIND(input);
8574 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575
8576 if (size == 0) {
8577 Py_INCREF(input);
8578 return input;
8579 }
8580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 /* allocate enough for a simple 1:1 translation without
8582 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008583 _PyUnicodeWriter_Init(&writer);
8584 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586
Victor Stinner1194ea02014-04-04 19:37:40 +02008587 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8588
8589 i = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008592 int translate;
8593 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8594 Py_ssize_t newpos;
8595 /* startpos for collecting untranslatable chars */
8596 Py_ssize_t collstart;
8597 Py_ssize_t collend;
8598 Py_ssize_t coll;
8599 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600
Victor Stinner1194ea02014-04-04 19:37:40 +02008601 ch = PyUnicode_READ(kind, data, i);
8602 translate = charmaptranslate_output(ch, mapping, &writer);
8603 if (translate < 0)
8604 goto onError;
8605
8606 if (translate != 0) {
8607 /* it worked => adjust input pointer */
8608 ++i;
8609 continue;
8610 }
8611
8612 /* untranslatable character */
8613 collstart = i;
8614 collend = i+1;
8615
8616 /* find all untranslatable characters */
8617 while (collend < size) {
8618 PyObject *x;
8619 ch = PyUnicode_READ(kind, data, collend);
8620 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008621 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008622 Py_XDECREF(x);
8623 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008625 ++collend;
8626 }
8627
8628 if (ignore) {
8629 i = collend;
8630 }
8631 else {
8632 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8633 reason, input, &exc,
8634 collstart, collend, &newpos);
8635 if (repunicode == NULL)
8636 goto onError;
8637 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008639 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008640 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008641 Py_DECREF(repunicode);
8642 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008643 }
8644 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 Py_XDECREF(exc);
8646 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008647 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008650 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651 Py_XDECREF(exc);
8652 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 return NULL;
8654}
8655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656/* Deprecated. Use PyUnicode_Translate instead. */
8657PyObject *
8658PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8659 Py_ssize_t size,
8660 PyObject *mapping,
8661 const char *errors)
8662{
Christian Heimes5f520f42012-09-11 14:03:25 +02008663 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8665 if (!unicode)
8666 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008667 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8668 Py_DECREF(unicode);
8669 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670}
8671
Alexander Belopolsky40018472011-02-26 01:02:56 +00008672PyObject *
8673PyUnicode_Translate(PyObject *str,
8674 PyObject *mapping,
8675 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676{
8677 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008678
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 str = PyUnicode_FromObject(str);
8680 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008681 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 Py_DECREF(str);
8684 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685}
Tim Petersced69f82003-09-16 20:30:58 +00008686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008688fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689{
8690 /* No need to call PyUnicode_READY(self) because this function is only
8691 called as a callback from fixup() which does it already. */
8692 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8693 const int kind = PyUnicode_KIND(self);
8694 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008695 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008696 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 Py_ssize_t i;
8698
8699 for (i = 0; i < len; ++i) {
8700 ch = PyUnicode_READ(kind, data, i);
8701 fixed = 0;
8702 if (ch > 127) {
8703 if (Py_UNICODE_ISSPACE(ch))
8704 fixed = ' ';
8705 else {
8706 const int decimal = Py_UNICODE_TODECIMAL(ch);
8707 if (decimal >= 0)
8708 fixed = '0' + decimal;
8709 }
8710 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008711 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008712 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 PyUnicode_WRITE(kind, data, i, fixed);
8714 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008715 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008716 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 }
8719
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008720 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008721}
8722
8723PyObject *
8724_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8725{
8726 if (!PyUnicode_Check(unicode)) {
8727 PyErr_BadInternalCall();
8728 return NULL;
8729 }
8730 if (PyUnicode_READY(unicode) == -1)
8731 return NULL;
8732 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8733 /* If the string is already ASCII, just return the same string */
8734 Py_INCREF(unicode);
8735 return unicode;
8736 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008737 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738}
8739
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008740PyObject *
8741PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8742 Py_ssize_t length)
8743{
Victor Stinnerf0124502011-11-21 23:12:56 +01008744 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008745 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008746 Py_UCS4 maxchar;
8747 enum PyUnicode_Kind kind;
8748 void *data;
8749
Victor Stinner99d7ad02012-02-22 13:37:39 +01008750 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008751 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008752 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008753 if (ch > 127) {
8754 int decimal = Py_UNICODE_TODECIMAL(ch);
8755 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008756 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008757 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008758 }
8759 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008760
8761 /* Copy to a new string */
8762 decimal = PyUnicode_New(length, maxchar);
8763 if (decimal == NULL)
8764 return decimal;
8765 kind = PyUnicode_KIND(decimal);
8766 data = PyUnicode_DATA(decimal);
8767 /* Iterate over code points */
8768 for (i = 0; i < length; i++) {
8769 Py_UNICODE ch = s[i];
8770 if (ch > 127) {
8771 int decimal = Py_UNICODE_TODECIMAL(ch);
8772 if (decimal >= 0)
8773 ch = '0' + decimal;
8774 }
8775 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008777 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008778}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008779/* --- Decimal Encoder ---------------------------------------------------- */
8780
Alexander Belopolsky40018472011-02-26 01:02:56 +00008781int
8782PyUnicode_EncodeDecimal(Py_UNICODE *s,
8783 Py_ssize_t length,
8784 char *output,
8785 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008786{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008787 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008788 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008789 enum PyUnicode_Kind kind;
8790 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008791
8792 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 PyErr_BadArgument();
8794 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008795 }
8796
Victor Stinner42bf7752011-11-21 22:52:58 +01008797 unicode = PyUnicode_FromUnicode(s, length);
8798 if (unicode == NULL)
8799 return -1;
8800
Benjamin Petersonbac79492012-01-14 13:34:47 -05008801 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008802 Py_DECREF(unicode);
8803 return -1;
8804 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008805 kind = PyUnicode_KIND(unicode);
8806 data = PyUnicode_DATA(unicode);
8807
Victor Stinnerb84d7232011-11-22 01:50:07 +01008808 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008809 PyObject *exc;
8810 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008812 Py_ssize_t startpos;
8813
8814 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008815
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008817 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008818 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008820 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 decimal = Py_UNICODE_TODECIMAL(ch);
8822 if (decimal >= 0) {
8823 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008824 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 continue;
8826 }
8827 if (0 < ch && ch < 256) {
8828 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008829 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 continue;
8831 }
Victor Stinner6345be92011-11-25 20:09:01 +01008832
Victor Stinner42bf7752011-11-21 22:52:58 +01008833 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008834 exc = NULL;
8835 raise_encode_exception(&exc, "decimal", unicode,
8836 startpos, startpos+1,
8837 "invalid decimal Unicode string");
8838 Py_XDECREF(exc);
8839 Py_DECREF(unicode);
8840 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008841 }
8842 /* 0-terminate the output string */
8843 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008844 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008845 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008846}
8847
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848/* --- Helpers ------------------------------------------------------------ */
8849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008851any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 Py_ssize_t start,
8853 Py_ssize_t end)
8854{
8855 int kind1, kind2, kind;
8856 void *buf1, *buf2;
8857 Py_ssize_t len1, len2, result;
8858
8859 kind1 = PyUnicode_KIND(s1);
8860 kind2 = PyUnicode_KIND(s2);
8861 kind = kind1 > kind2 ? kind1 : kind2;
8862 buf1 = PyUnicode_DATA(s1);
8863 buf2 = PyUnicode_DATA(s2);
8864 if (kind1 != kind)
8865 buf1 = _PyUnicode_AsKind(s1, kind);
8866 if (!buf1)
8867 return -2;
8868 if (kind2 != kind)
8869 buf2 = _PyUnicode_AsKind(s2, kind);
8870 if (!buf2) {
8871 if (kind1 != kind) PyMem_Free(buf1);
8872 return -2;
8873 }
8874 len1 = PyUnicode_GET_LENGTH(s1);
8875 len2 = PyUnicode_GET_LENGTH(s2);
8876
Victor Stinner794d5672011-10-10 03:21:36 +02008877 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008878 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008879 case PyUnicode_1BYTE_KIND:
8880 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8881 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8882 else
8883 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8884 break;
8885 case PyUnicode_2BYTE_KIND:
8886 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8887 break;
8888 case PyUnicode_4BYTE_KIND:
8889 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8890 break;
8891 default:
8892 assert(0); result = -2;
8893 }
8894 }
8895 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008896 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008897 case PyUnicode_1BYTE_KIND:
8898 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8899 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8900 else
8901 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8902 break;
8903 case PyUnicode_2BYTE_KIND:
8904 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8905 break;
8906 case PyUnicode_4BYTE_KIND:
8907 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8908 break;
8909 default:
8910 assert(0); result = -2;
8911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 }
8913
8914 if (kind1 != kind)
8915 PyMem_Free(buf1);
8916 if (kind2 != kind)
8917 PyMem_Free(buf2);
8918
8919 return result;
8920}
8921
8922Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008923_PyUnicode_InsertThousandsGrouping(
8924 PyObject *unicode, Py_ssize_t index,
8925 Py_ssize_t n_buffer,
8926 void *digits, Py_ssize_t n_digits,
8927 Py_ssize_t min_width,
8928 const char *grouping, PyObject *thousands_sep,
8929 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930{
Victor Stinner41a863c2012-02-24 00:37:51 +01008931 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008932 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008933 Py_ssize_t thousands_sep_len;
8934 Py_ssize_t len;
8935
8936 if (unicode != NULL) {
8937 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008938 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008939 }
8940 else {
8941 kind = PyUnicode_1BYTE_KIND;
8942 data = NULL;
8943 }
8944 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8945 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8946 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8947 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008948 if (thousands_sep_kind < kind) {
8949 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8950 if (!thousands_sep_data)
8951 return -1;
8952 }
8953 else {
8954 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8955 if (!data)
8956 return -1;
8957 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008958 }
8959
Benjamin Petersonead6b532011-12-20 17:23:42 -06008960 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008962 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008963 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008964 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008965 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008966 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008967 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008968 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008969 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008970 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008971 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008972 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008974 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008975 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008976 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008977 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008978 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008980 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008981 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008982 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008983 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008984 break;
8985 default:
8986 assert(0);
8987 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008989 if (unicode != NULL && thousands_sep_kind != kind) {
8990 if (thousands_sep_kind < kind)
8991 PyMem_Free(thousands_sep_data);
8992 else
8993 PyMem_Free(data);
8994 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008995 if (unicode == NULL) {
8996 *maxchar = 127;
8997 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008998 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008999 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009000 }
9001 }
9002 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003}
9004
9005
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues of a signed integer type.
   `end` is clipped to `len`; a negative `end` or `start` is offset by `len`
   and then clipped to 0.  `start` is not clipped from above.
   `len` may be evaluated several times, so it must have no side effects.

   The expansion is a single statement: use it followed by a semicolon.
   This makes it safe as the body of an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009020
Alexander Belopolsky40018472011-02-26 01:02:56 +00009021Py_ssize_t
9022PyUnicode_Count(PyObject *str,
9023 PyObject *substr,
9024 Py_ssize_t start,
9025 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009027 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009028 PyObject* str_obj;
9029 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 int kind1, kind2, kind;
9031 void *buf1 = NULL, *buf2 = NULL;
9032 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009033
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009034 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009035 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009037 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009038 if (!sub_obj) {
9039 Py_DECREF(str_obj);
9040 return -1;
9041 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009042 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009043 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 Py_DECREF(str_obj);
9045 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 }
Tim Petersced69f82003-09-16 20:30:58 +00009047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 kind1 = PyUnicode_KIND(str_obj);
9049 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009050 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009053 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009054 if (kind2 > kind) {
9055 Py_DECREF(sub_obj);
9056 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009057 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009058 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009059 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 if (!buf2)
9062 goto onError;
9063 len1 = PyUnicode_GET_LENGTH(str_obj);
9064 len2 = PyUnicode_GET_LENGTH(sub_obj);
9065
9066 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009067 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009069 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9070 result = asciilib_count(
9071 ((Py_UCS1*)buf1) + start, end - start,
9072 buf2, len2, PY_SSIZE_T_MAX
9073 );
9074 else
9075 result = ucs1lib_count(
9076 ((Py_UCS1*)buf1) + start, end - start,
9077 buf2, len2, PY_SSIZE_T_MAX
9078 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 break;
9080 case PyUnicode_2BYTE_KIND:
9081 result = ucs2lib_count(
9082 ((Py_UCS2*)buf1) + start, end - start,
9083 buf2, len2, PY_SSIZE_T_MAX
9084 );
9085 break;
9086 case PyUnicode_4BYTE_KIND:
9087 result = ucs4lib_count(
9088 ((Py_UCS4*)buf1) + start, end - start,
9089 buf2, len2, PY_SSIZE_T_MAX
9090 );
9091 break;
9092 default:
9093 assert(0); result = 0;
9094 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009095
9096 Py_DECREF(sub_obj);
9097 Py_DECREF(str_obj);
9098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 if (kind2 != kind)
9100 PyMem_Free(buf2);
9101
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103 onError:
9104 Py_DECREF(sub_obj);
9105 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106 if (kind2 != kind && buf2)
9107 PyMem_Free(buf2);
9108 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109}
9110
Alexander Belopolsky40018472011-02-26 01:02:56 +00009111Py_ssize_t
9112PyUnicode_Find(PyObject *str,
9113 PyObject *sub,
9114 Py_ssize_t start,
9115 Py_ssize_t end,
9116 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009118 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009119
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009121 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009123 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009124 if (!sub) {
9125 Py_DECREF(str);
9126 return -2;
9127 }
9128 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9129 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 Py_DECREF(str);
9131 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 }
Tim Petersced69f82003-09-16 20:30:58 +00009133
Victor Stinner794d5672011-10-10 03:21:36 +02009134 result = any_find_slice(direction,
9135 str, sub, start, end
9136 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009137
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009139 Py_DECREF(sub);
9140
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 return result;
9142}
9143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144Py_ssize_t
9145PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9146 Py_ssize_t start, Py_ssize_t end,
9147 int direction)
9148{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009150 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009151 if (PyUnicode_READY(str) == -1)
9152 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009153 if (start < 0 || end < 0) {
9154 PyErr_SetString(PyExc_IndexError, "string index out of range");
9155 return -2;
9156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157 if (end > PyUnicode_GET_LENGTH(str))
9158 end = PyUnicode_GET_LENGTH(str);
9159 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009160 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9161 kind, end-start, ch, direction);
9162 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009164 else
9165 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166}
9167
Alexander Belopolsky40018472011-02-26 01:02:56 +00009168static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009169tailmatch(PyObject *self,
9170 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009171 Py_ssize_t start,
9172 Py_ssize_t end,
9173 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 int kind_self;
9176 int kind_sub;
9177 void *data_self;
9178 void *data_sub;
9179 Py_ssize_t offset;
9180 Py_ssize_t i;
9181 Py_ssize_t end_sub;
9182
9183 if (PyUnicode_READY(self) == -1 ||
9184 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009185 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186
9187 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188 return 1;
9189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9191 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 kind_self = PyUnicode_KIND(self);
9196 data_self = PyUnicode_DATA(self);
9197 kind_sub = PyUnicode_KIND(substring);
9198 data_sub = PyUnicode_DATA(substring);
9199 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9200
9201 if (direction > 0)
9202 offset = end;
9203 else
9204 offset = start;
9205
9206 if (PyUnicode_READ(kind_self, data_self, offset) ==
9207 PyUnicode_READ(kind_sub, data_sub, 0) &&
9208 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9209 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9210 /* If both are of the same kind, memcmp is sufficient */
9211 if (kind_self == kind_sub) {
9212 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009213 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 data_sub,
9215 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009216 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 }
9218 /* otherwise we have to compare each character by first accesing it */
9219 else {
9220 /* We do not need to compare 0 and len(substring)-1 because
9221 the if statement above ensured already that they are equal
9222 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 for (i = 1; i < end_sub; ++i) {
9224 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9225 PyUnicode_READ(kind_sub, data_sub, i))
9226 return 0;
9227 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009228 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230 }
9231
9232 return 0;
9233}
9234
Alexander Belopolsky40018472011-02-26 01:02:56 +00009235Py_ssize_t
9236PyUnicode_Tailmatch(PyObject *str,
9237 PyObject *substr,
9238 Py_ssize_t start,
9239 Py_ssize_t end,
9240 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009242 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009243
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 str = PyUnicode_FromObject(str);
9245 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247 substr = PyUnicode_FromObject(substr);
9248 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009249 Py_DECREF(str);
9250 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251 }
Tim Petersced69f82003-09-16 20:30:58 +00009252
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009253 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 Py_DECREF(str);
9256 Py_DECREF(substr);
9257 return result;
9258}
9259
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260/* Apply fixfct filter to the Unicode object self and return a
9261 reference to the modified object */
9262
Alexander Belopolsky40018472011-02-26 01:02:56 +00009263static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009264fixup(PyObject *self,
9265 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 PyObject *u;
9268 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009269 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009271 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009272 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009273 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009274 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276 /* fix functions return the new maximum character in a string,
9277 if the kind of the resulting unicode object does not change,
9278 everything is fine. Otherwise we need to change the string kind
9279 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009280 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009281
9282 if (maxchar_new == 0) {
9283 /* no changes */;
9284 if (PyUnicode_CheckExact(self)) {
9285 Py_DECREF(u);
9286 Py_INCREF(self);
9287 return self;
9288 }
9289 else
9290 return u;
9291 }
9292
Victor Stinnere6abb482012-05-02 01:15:40 +02009293 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294
Victor Stinnereaab6042011-12-11 22:22:39 +01009295 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009297
9298 /* In case the maximum character changed, we need to
9299 convert the string to the new category. */
9300 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9301 if (v == NULL) {
9302 Py_DECREF(u);
9303 return NULL;
9304 }
9305 if (maxchar_new > maxchar_old) {
9306 /* If the maxchar increased so that the kind changed, not all
9307 characters are representable anymore and we need to fix the
9308 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009309 _PyUnicode_FastCopyCharacters(v, 0,
9310 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009311 maxchar_old = fixfct(v);
9312 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009315 _PyUnicode_FastCopyCharacters(v, 0,
9316 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009318 Py_DECREF(u);
9319 assert(_PyUnicode_CheckConsistency(v, 1));
9320 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321}
9322
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009323static PyObject *
9324ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009326 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9327 char *resdata, *data = PyUnicode_DATA(self);
9328 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009329
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009330 res = PyUnicode_New(len, 127);
9331 if (res == NULL)
9332 return NULL;
9333 resdata = PyUnicode_DATA(res);
9334 if (lower)
9335 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009337 _Py_bytes_upper(resdata, data, len);
9338 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339}
9340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009342handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009344 Py_ssize_t j;
9345 int final_sigma;
9346 Py_UCS4 c;
9347 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009348
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009349 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9350
9351 where ! is a negation and \p{xxx} is a character with property xxx.
9352 */
9353 for (j = i - 1; j >= 0; j--) {
9354 c = PyUnicode_READ(kind, data, j);
9355 if (!_PyUnicode_IsCaseIgnorable(c))
9356 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009358 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9359 if (final_sigma) {
9360 for (j = i + 1; j < length; j++) {
9361 c = PyUnicode_READ(kind, data, j);
9362 if (!_PyUnicode_IsCaseIgnorable(c))
9363 break;
9364 }
9365 final_sigma = j == length || !_PyUnicode_IsCased(c);
9366 }
9367 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368}
9369
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009370static int
9371lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9372 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009374 /* Obscure special case. */
9375 if (c == 0x3A3) {
9376 mapped[0] = handle_capital_sigma(kind, data, length, i);
9377 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009379 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380}
9381
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009382static Py_ssize_t
9383do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009385 Py_ssize_t i, k = 0;
9386 int n_res, j;
9387 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009388
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009389 c = PyUnicode_READ(kind, data, 0);
9390 n_res = _PyUnicode_ToUpperFull(c, mapped);
9391 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009392 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009393 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009395 for (i = 1; i < length; i++) {
9396 c = PyUnicode_READ(kind, data, i);
9397 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9398 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009399 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009400 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009401 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009402 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009403 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404}
9405
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009406static Py_ssize_t
9407do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9408 Py_ssize_t i, k = 0;
9409
9410 for (i = 0; i < length; i++) {
9411 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9412 int n_res, j;
9413 if (Py_UNICODE_ISUPPER(c)) {
9414 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9415 }
9416 else if (Py_UNICODE_ISLOWER(c)) {
9417 n_res = _PyUnicode_ToUpperFull(c, mapped);
9418 }
9419 else {
9420 n_res = 1;
9421 mapped[0] = c;
9422 }
9423 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009424 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009425 res[k++] = mapped[j];
9426 }
9427 }
9428 return k;
9429}
9430
9431static Py_ssize_t
9432do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9433 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009435 Py_ssize_t i, k = 0;
9436
9437 for (i = 0; i < length; i++) {
9438 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9439 int n_res, j;
9440 if (lower)
9441 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9442 else
9443 n_res = _PyUnicode_ToUpperFull(c, mapped);
9444 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009445 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009446 res[k++] = mapped[j];
9447 }
9448 }
9449 return k;
9450}
9451
9452static Py_ssize_t
9453do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9454{
9455 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9456}
9457
9458static Py_ssize_t
9459do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9460{
9461 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9462}
9463
Benjamin Petersone51757f2012-01-12 21:10:29 -05009464static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009465do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9466{
9467 Py_ssize_t i, k = 0;
9468
9469 for (i = 0; i < length; i++) {
9470 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9471 Py_UCS4 mapped[3];
9472 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9473 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009474 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009475 res[k++] = mapped[j];
9476 }
9477 }
9478 return k;
9479}
9480
9481static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009482do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9483{
9484 Py_ssize_t i, k = 0;
9485 int previous_is_cased;
9486
9487 previous_is_cased = 0;
9488 for (i = 0; i < length; i++) {
9489 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9490 Py_UCS4 mapped[3];
9491 int n_res, j;
9492
9493 if (previous_is_cased)
9494 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9495 else
9496 n_res = _PyUnicode_ToTitleFull(c, mapped);
9497
9498 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009499 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009500 res[k++] = mapped[j];
9501 }
9502
9503 previous_is_cased = _PyUnicode_IsCased(c);
9504 }
9505 return k;
9506}
9507
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508static PyObject *
9509case_operation(PyObject *self,
9510 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9511{
9512 PyObject *res = NULL;
9513 Py_ssize_t length, newlength = 0;
9514 int kind, outkind;
9515 void *data, *outdata;
9516 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9517
Benjamin Petersoneea48462012-01-16 14:28:50 -05009518 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009519
9520 kind = PyUnicode_KIND(self);
9521 data = PyUnicode_DATA(self);
9522 length = PyUnicode_GET_LENGTH(self);
9523 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9524 if (tmp == NULL)
9525 return PyErr_NoMemory();
9526 newlength = perform(kind, data, length, tmp, &maxchar);
9527 res = PyUnicode_New(newlength, maxchar);
9528 if (res == NULL)
9529 goto leave;
9530 tmpend = tmp + newlength;
9531 outdata = PyUnicode_DATA(res);
9532 outkind = PyUnicode_KIND(res);
9533 switch (outkind) {
9534 case PyUnicode_1BYTE_KIND:
9535 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9536 break;
9537 case PyUnicode_2BYTE_KIND:
9538 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9539 break;
9540 case PyUnicode_4BYTE_KIND:
9541 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9542 break;
9543 default:
9544 assert(0);
9545 break;
9546 }
9547 leave:
9548 PyMem_FREE(tmp);
9549 return res;
9550}
9551
Tim Peters8ce9f162004-08-27 01:49:32 +00009552PyObject *
9553PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009556 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009558 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009559 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9560 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009561 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009563 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009565 int use_memcpy;
9566 unsigned char *res_data = NULL, *sep_data = NULL;
9567 PyObject *last_obj;
9568 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009570 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009571 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009572 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009573 }
9574
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009575 /* NOTE: the following code can't call back into Python code,
9576 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009577 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009578
Tim Peters05eba1f2004-08-27 21:32:02 +00009579 seqlen = PySequence_Fast_GET_SIZE(fseq);
9580 /* If empty sequence, return u"". */
9581 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009582 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009583 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009584 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009585
Tim Peters05eba1f2004-08-27 21:32:02 +00009586 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009587 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009588 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009589 if (seqlen == 1) {
9590 if (PyUnicode_CheckExact(items[0])) {
9591 res = items[0];
9592 Py_INCREF(res);
9593 Py_DECREF(fseq);
9594 return res;
9595 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009596 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009597 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009598 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009599 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009600 /* Set up sep and seplen */
9601 if (separator == NULL) {
9602 /* fall back to a blank space separator */
9603 sep = PyUnicode_FromOrdinal(' ');
9604 if (!sep)
9605 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009606 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009607 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009608 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009609 else {
9610 if (!PyUnicode_Check(separator)) {
9611 PyErr_Format(PyExc_TypeError,
9612 "separator: expected str instance,"
9613 " %.80s found",
9614 Py_TYPE(separator)->tp_name);
9615 goto onError;
9616 }
9617 if (PyUnicode_READY(separator))
9618 goto onError;
9619 sep = separator;
9620 seplen = PyUnicode_GET_LENGTH(separator);
9621 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9622 /* inc refcount to keep this code path symmetric with the
9623 above case of a blank separator */
9624 Py_INCREF(sep);
9625 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009626 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009627 }
9628
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009629 /* There are at least two things to join, or else we have a subclass
9630 * of str in the sequence.
9631 * Do a pre-pass to figure out the total amount of space we'll
9632 * need (sz), and see whether all argument are strings.
9633 */
9634 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009635#ifdef Py_DEBUG
9636 use_memcpy = 0;
9637#else
9638 use_memcpy = 1;
9639#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009640 for (i = 0; i < seqlen; i++) {
9641 const Py_ssize_t old_sz = sz;
9642 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009643 if (!PyUnicode_Check(item)) {
9644 PyErr_Format(PyExc_TypeError,
9645 "sequence item %zd: expected str instance,"
9646 " %.80s found",
9647 i, Py_TYPE(item)->tp_name);
9648 goto onError;
9649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 if (PyUnicode_READY(item) == -1)
9651 goto onError;
9652 sz += PyUnicode_GET_LENGTH(item);
9653 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009654 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009655 if (i != 0)
9656 sz += seplen;
9657 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9658 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009660 goto onError;
9661 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009662 if (use_memcpy && last_obj != NULL) {
9663 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9664 use_memcpy = 0;
9665 }
9666 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009667 }
Tim Petersced69f82003-09-16 20:30:58 +00009668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009670 if (res == NULL)
9671 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009672
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009673 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009674#ifdef Py_DEBUG
9675 use_memcpy = 0;
9676#else
9677 if (use_memcpy) {
9678 res_data = PyUnicode_1BYTE_DATA(res);
9679 kind = PyUnicode_KIND(res);
9680 if (seplen != 0)
9681 sep_data = PyUnicode_1BYTE_DATA(sep);
9682 }
9683#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009684 if (use_memcpy) {
9685 for (i = 0; i < seqlen; ++i) {
9686 Py_ssize_t itemlen;
9687 item = items[i];
9688
9689 /* Copy item, and maybe the separator. */
9690 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009691 Py_MEMCPY(res_data,
9692 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009693 kind * seplen);
9694 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009695 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009696
9697 itemlen = PyUnicode_GET_LENGTH(item);
9698 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009699 Py_MEMCPY(res_data,
9700 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009701 kind * itemlen);
9702 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009703 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009704 }
9705 assert(res_data == PyUnicode_1BYTE_DATA(res)
9706 + kind * PyUnicode_GET_LENGTH(res));
9707 }
9708 else {
9709 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9710 Py_ssize_t itemlen;
9711 item = items[i];
9712
9713 /* Copy item, and maybe the separator. */
9714 if (i && seplen != 0) {
9715 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9716 res_offset += seplen;
9717 }
9718
9719 itemlen = PyUnicode_GET_LENGTH(item);
9720 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009721 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009722 res_offset += itemlen;
9723 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009724 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009725 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009726 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009727
Tim Peters05eba1f2004-08-27 21:32:02 +00009728 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009730 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732
Benjamin Peterson29060642009-01-31 22:14:21 +00009733 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009734 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009736 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737 return NULL;
9738}
9739
/* FILL(kind, data, value, start, length): store value into length
   consecutive characters of the buffer data, beginning at character
   index start.  kind selects the character width (1, 2 or 4 bytes) and
   must not be PyUnicode_WCHAR_KIND.  For the 1-byte kind value is
   truncated to unsigned char.  Arguments can be evaluated more than
   once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9763
Victor Stinnerd3f08822012-05-29 12:57:52 +02009764void
9765_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9766 Py_UCS4 fill_char)
9767{
9768 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9769 const void *data = PyUnicode_DATA(unicode);
9770 assert(PyUnicode_IS_READY(unicode));
9771 assert(unicode_modifiable(unicode));
9772 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9773 assert(start >= 0);
9774 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9775 FILL(kind, data, fill_char, start, length);
9776}
9777
Victor Stinner3fe55312012-01-04 00:33:50 +01009778Py_ssize_t
9779PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9780 Py_UCS4 fill_char)
9781{
9782 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009783
9784 if (!PyUnicode_Check(unicode)) {
9785 PyErr_BadInternalCall();
9786 return -1;
9787 }
9788 if (PyUnicode_READY(unicode) == -1)
9789 return -1;
9790 if (unicode_check_modifiable(unicode))
9791 return -1;
9792
Victor Stinnerd3f08822012-05-29 12:57:52 +02009793 if (start < 0) {
9794 PyErr_SetString(PyExc_IndexError, "string index out of range");
9795 return -1;
9796 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009797 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9798 PyErr_SetString(PyExc_ValueError,
9799 "fill character is bigger than "
9800 "the string maximum character");
9801 return -1;
9802 }
9803
9804 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9805 length = Py_MIN(maxlen, length);
9806 if (length <= 0)
9807 return 0;
9808
Victor Stinnerd3f08822012-05-29 12:57:52 +02009809 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009810 return length;
9811}
9812
Victor Stinner9310abb2011-10-05 00:59:23 +02009813static PyObject *
9814pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009815 Py_ssize_t left,
9816 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 PyObject *u;
9820 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009821 int kind;
9822 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823
9824 if (left < 0)
9825 left = 0;
9826 if (right < 0)
9827 right = 0;
9828
Victor Stinnerc4b49542011-12-11 22:44:26 +01009829 if (left == 0 && right == 0)
9830 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9833 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009834 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9835 return NULL;
9836 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009838 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009840 if (!u)
9841 return NULL;
9842
9843 kind = PyUnicode_KIND(u);
9844 data = PyUnicode_DATA(u);
9845 if (left)
9846 FILL(kind, data, fill, 0, left);
9847 if (right)
9848 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009849 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009850 assert(_PyUnicode_CheckConsistency(u, 1));
9851 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852}
9853
Alexander Belopolsky40018472011-02-26 01:02:56 +00009854PyObject *
9855PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858
9859 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009860 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009861 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009862 if (PyUnicode_READY(string) == -1) {
9863 Py_DECREF(string);
9864 return NULL;
9865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866
Benjamin Petersonead6b532011-12-20 17:23:42 -06009867 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009869 if (PyUnicode_IS_ASCII(string))
9870 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009871 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009872 PyUnicode_GET_LENGTH(string), keepends);
9873 else
9874 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009876 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 break;
9878 case PyUnicode_2BYTE_KIND:
9879 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009880 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 PyUnicode_GET_LENGTH(string), keepends);
9882 break;
9883 case PyUnicode_4BYTE_KIND:
9884 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009885 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 PyUnicode_GET_LENGTH(string), keepends);
9887 break;
9888 default:
9889 assert(0);
9890 list = 0;
9891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892 Py_DECREF(string);
9893 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894}
9895
Alexander Belopolsky40018472011-02-26 01:02:56 +00009896static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009897split(PyObject *self,
9898 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009899 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 int kind1, kind2, kind;
9902 void *buf1, *buf2;
9903 Py_ssize_t len1, len2;
9904 PyObject* out;
9905
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009907 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 if (PyUnicode_READY(self) == -1)
9910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009913 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009915 if (PyUnicode_IS_ASCII(self))
9916 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009917 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009918 PyUnicode_GET_LENGTH(self), maxcount
9919 );
9920 else
9921 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009923 PyUnicode_GET_LENGTH(self), maxcount
9924 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 case PyUnicode_2BYTE_KIND:
9926 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 PyUnicode_GET_LENGTH(self), maxcount
9929 );
9930 case PyUnicode_4BYTE_KIND:
9931 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009932 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 PyUnicode_GET_LENGTH(self), maxcount
9934 );
9935 default:
9936 assert(0);
9937 return NULL;
9938 }
9939
9940 if (PyUnicode_READY(substring) == -1)
9941 return NULL;
9942
9943 kind1 = PyUnicode_KIND(self);
9944 kind2 = PyUnicode_KIND(substring);
9945 kind = kind1 > kind2 ? kind1 : kind2;
9946 buf1 = PyUnicode_DATA(self);
9947 buf2 = PyUnicode_DATA(substring);
9948 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009949 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 if (!buf1)
9951 return NULL;
9952 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009953 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 if (!buf2) {
9955 if (kind1 != kind) PyMem_Free(buf1);
9956 return NULL;
9957 }
9958 len1 = PyUnicode_GET_LENGTH(self);
9959 len2 = PyUnicode_GET_LENGTH(substring);
9960
Benjamin Petersonead6b532011-12-20 17:23:42 -06009961 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009963 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9964 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009965 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009966 else
9967 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009968 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009969 break;
9970 case PyUnicode_2BYTE_KIND:
9971 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009972 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 break;
9974 case PyUnicode_4BYTE_KIND:
9975 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009976 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 break;
9978 default:
9979 out = NULL;
9980 }
9981 if (kind1 != kind)
9982 PyMem_Free(buf1);
9983 if (kind2 != kind)
9984 PyMem_Free(buf2);
9985 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986}
9987
Alexander Belopolsky40018472011-02-26 01:02:56 +00009988static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009989rsplit(PyObject *self,
9990 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009991 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 int kind1, kind2, kind;
9994 void *buf1, *buf2;
9995 Py_ssize_t len1, len2;
9996 PyObject* out;
9997
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009998 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009999 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 if (PyUnicode_READY(self) == -1)
10002 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010005 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010007 if (PyUnicode_IS_ASCII(self))
10008 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010009 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010010 PyUnicode_GET_LENGTH(self), maxcount
10011 );
10012 else
10013 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010014 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010015 PyUnicode_GET_LENGTH(self), maxcount
10016 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 case PyUnicode_2BYTE_KIND:
10018 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010019 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 PyUnicode_GET_LENGTH(self), maxcount
10021 );
10022 case PyUnicode_4BYTE_KIND:
10023 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010024 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 PyUnicode_GET_LENGTH(self), maxcount
10026 );
10027 default:
10028 assert(0);
10029 return NULL;
10030 }
10031
10032 if (PyUnicode_READY(substring) == -1)
10033 return NULL;
10034
10035 kind1 = PyUnicode_KIND(self);
10036 kind2 = PyUnicode_KIND(substring);
10037 kind = kind1 > kind2 ? kind1 : kind2;
10038 buf1 = PyUnicode_DATA(self);
10039 buf2 = PyUnicode_DATA(substring);
10040 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010041 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 if (!buf1)
10043 return NULL;
10044 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010045 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 if (!buf2) {
10047 if (kind1 != kind) PyMem_Free(buf1);
10048 return NULL;
10049 }
10050 len1 = PyUnicode_GET_LENGTH(self);
10051 len2 = PyUnicode_GET_LENGTH(substring);
10052
Benjamin Petersonead6b532011-12-20 17:23:42 -060010053 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010055 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10056 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010057 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010058 else
10059 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010060 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 break;
10062 case PyUnicode_2BYTE_KIND:
10063 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010064 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 break;
10066 case PyUnicode_4BYTE_KIND:
10067 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010068 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 break;
10070 default:
10071 out = NULL;
10072 }
10073 if (kind1 != kind)
10074 PyMem_Free(buf1);
10075 if (kind2 != kind)
10076 PyMem_Free(buf2);
10077 return out;
10078}
10079
10080static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010081anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10082 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010084 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010086 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10087 return asciilib_find(buf1, len1, buf2, len2, offset);
10088 else
10089 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 case PyUnicode_2BYTE_KIND:
10091 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10092 case PyUnicode_4BYTE_KIND:
10093 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10094 }
10095 assert(0);
10096 return -1;
10097}
10098
10099static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010100anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10101 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010103 switch (kind) {
10104 case PyUnicode_1BYTE_KIND:
10105 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10106 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10107 else
10108 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10109 case PyUnicode_2BYTE_KIND:
10110 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10111 case PyUnicode_4BYTE_KIND:
10112 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10113 }
10114 assert(0);
10115 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010116}
10117
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010118static void
10119replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10120 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10121{
10122 int kind = PyUnicode_KIND(u);
10123 void *data = PyUnicode_DATA(u);
10124 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10125 if (kind == PyUnicode_1BYTE_KIND) {
10126 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10127 (Py_UCS1 *)data + len,
10128 u1, u2, maxcount);
10129 }
10130 else if (kind == PyUnicode_2BYTE_KIND) {
10131 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10132 (Py_UCS2 *)data + len,
10133 u1, u2, maxcount);
10134 }
10135 else {
10136 assert(kind == PyUnicode_4BYTE_KIND);
10137 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10138 (Py_UCS4 *)data + len,
10139 u1, u2, maxcount);
10140 }
10141}
10142
Alexander Belopolsky40018472011-02-26 01:02:56 +000010143static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144replace(PyObject *self, PyObject *str1,
10145 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 PyObject *u;
10148 char *sbuf = PyUnicode_DATA(self);
10149 char *buf1 = PyUnicode_DATA(str1);
10150 char *buf2 = PyUnicode_DATA(str2);
10151 int srelease = 0, release1 = 0, release2 = 0;
10152 int skind = PyUnicode_KIND(self);
10153 int kind1 = PyUnicode_KIND(str1);
10154 int kind2 = PyUnicode_KIND(str2);
10155 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10156 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10157 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010158 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010159 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160
10161 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010162 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010164 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165
Victor Stinner59de0ee2011-10-07 10:01:28 +020010166 if (str1 == str2)
10167 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168
Victor Stinner49a0a212011-10-12 23:46:10 +020010169 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010170 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10171 if (maxchar < maxchar_str1)
10172 /* substring too wide to be present */
10173 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010174 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10175 /* Replacing str1 with str2 may cause a maxchar reduction in the
10176 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010177 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010178 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010181 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010183 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010185 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010186 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010187 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010188
Victor Stinner69ed0f42013-04-09 21:48:24 +020010189 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010190 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010191 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010192 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010193 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010195 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010197
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010198 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10199 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010200 }
10201 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 int rkind = skind;
10203 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010204 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 if (kind1 < rkind) {
10207 /* widen substring */
10208 buf1 = _PyUnicode_AsKind(str1, rkind);
10209 if (!buf1) goto error;
10210 release1 = 1;
10211 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010213 if (i < 0)
10214 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (rkind > kind2) {
10216 /* widen replacement */
10217 buf2 = _PyUnicode_AsKind(str2, rkind);
10218 if (!buf2) goto error;
10219 release2 = 1;
10220 }
10221 else if (rkind < kind2) {
10222 /* widen self and buf1 */
10223 rkind = kind2;
10224 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010225 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 sbuf = _PyUnicode_AsKind(self, rkind);
10227 if (!sbuf) goto error;
10228 srelease = 1;
10229 buf1 = _PyUnicode_AsKind(str1, rkind);
10230 if (!buf1) goto error;
10231 release1 = 1;
10232 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010233 u = PyUnicode_New(slen, maxchar);
10234 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010236 assert(PyUnicode_KIND(u) == rkind);
10237 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010238
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010239 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010240 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010241 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010243 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010245
10246 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010247 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010248 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010250 if (i == -1)
10251 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010252 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010254 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010258 }
10259 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010261 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 int rkind = skind;
10263 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010266 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 buf1 = _PyUnicode_AsKind(str1, rkind);
10268 if (!buf1) goto error;
10269 release1 = 1;
10270 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010271 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010272 if (n == 0)
10273 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010275 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 buf2 = _PyUnicode_AsKind(str2, rkind);
10277 if (!buf2) goto error;
10278 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010281 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 rkind = kind2;
10283 sbuf = _PyUnicode_AsKind(self, rkind);
10284 if (!sbuf) goto error;
10285 srelease = 1;
10286 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010287 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 buf1 = _PyUnicode_AsKind(str1, rkind);
10289 if (!buf1) goto error;
10290 release1 = 1;
10291 }
10292 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10293 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010294 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 PyErr_SetString(PyExc_OverflowError,
10296 "replace string is too long");
10297 goto error;
10298 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010299 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010300 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010301 _Py_INCREF_UNICODE_EMPTY();
10302 if (!unicode_empty)
10303 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010304 u = unicode_empty;
10305 goto done;
10306 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010307 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 PyErr_SetString(PyExc_OverflowError,
10309 "replace string is too long");
10310 goto error;
10311 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010312 u = PyUnicode_New(new_size, maxchar);
10313 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010315 assert(PyUnicode_KIND(u) == rkind);
10316 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 ires = i = 0;
10318 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 while (n-- > 0) {
10320 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010321 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010322 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010324 if (j == -1)
10325 break;
10326 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010327 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010328 memcpy(res + rkind * ires,
10329 sbuf + rkind * i,
10330 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010332 }
10333 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010335 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010337 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010343 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010344 memcpy(res + rkind * ires,
10345 sbuf + rkind * i,
10346 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010347 }
10348 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010349 /* interleave */
10350 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010351 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010355 if (--n <= 0)
10356 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010357 memcpy(res + rkind * ires,
10358 sbuf + rkind * i,
10359 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 ires++;
10361 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010362 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010363 memcpy(res + rkind * ires,
10364 sbuf + rkind * i,
10365 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010366 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010367 }
10368
10369 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010370 unicode_adjust_maxchar(&u);
10371 if (u == NULL)
10372 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010374
10375 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 if (srelease)
10377 PyMem_FREE(sbuf);
10378 if (release1)
10379 PyMem_FREE(buf1);
10380 if (release2)
10381 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010382 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010384
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010386 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 if (srelease)
10388 PyMem_FREE(sbuf);
10389 if (release1)
10390 PyMem_FREE(buf1);
10391 if (release2)
10392 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010393 return unicode_result_unchanged(self);
10394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 error:
10396 if (srelease && sbuf)
10397 PyMem_FREE(sbuf);
10398 if (release1 && buf1)
10399 PyMem_FREE(buf1);
10400 if (release2 && buf2)
10401 PyMem_FREE(buf2);
10402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403}
10404
10405/* --- Unicode Object Methods --------------------------------------------- */
10406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010407PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010408 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409\n\
10410Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010411characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412
10413static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010414unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010416 if (PyUnicode_READY(self) == -1)
10417 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010418 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419}
10420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010421PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010422 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423\n\
10424Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010425have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426
10427static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010428unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010430 if (PyUnicode_READY(self) == -1)
10431 return NULL;
10432 if (PyUnicode_GET_LENGTH(self) == 0)
10433 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010434 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435}
10436
Benjamin Petersond5890c82012-01-14 13:23:30 -050010437PyDoc_STRVAR(casefold__doc__,
10438 "S.casefold() -> str\n\
10439\n\
10440Return a version of S suitable for caseless comparisons.");
10441
10442static PyObject *
10443unicode_casefold(PyObject *self)
10444{
10445 if (PyUnicode_READY(self) == -1)
10446 return NULL;
10447 if (PyUnicode_IS_ASCII(self))
10448 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010449 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010450}
10451
10452
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010453/* Argument converter. Coerces to a single unicode character */
10454
10455static int
10456convert_uc(PyObject *obj, void *addr)
10457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010459 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010460
Benjamin Peterson14339b62009-01-31 16:36:08 +000010461 uniobj = PyUnicode_FromObject(obj);
10462 if (uniobj == NULL) {
10463 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010464 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010465 return 0;
10466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010468 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010469 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010470 Py_DECREF(uniobj);
10471 return 0;
10472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010474 Py_DECREF(uniobj);
10475 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010476}
10477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010478PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010479 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010481Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010482done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483
10484static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010485unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010487 Py_ssize_t marg, left;
10488 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 Py_UCS4 fillchar = ' ';
10490
Victor Stinnere9a29352011-10-01 02:14:59 +020010491 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493
Benjamin Petersonbac79492012-01-14 13:34:47 -050010494 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 return NULL;
10496
Victor Stinnerc4b49542011-12-11 22:44:26 +010010497 if (PyUnicode_GET_LENGTH(self) >= width)
10498 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499
Victor Stinnerc4b49542011-12-11 22:44:26 +010010500 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501 left = marg / 2 + (marg & width & 1);
10502
Victor Stinner9310abb2011-10-05 00:59:23 +020010503 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504}
10505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506/* This function assumes that str1 and str2 are readied by the caller. */
10507
Marc-André Lemburge5034372000-08-08 08:04:29 +000010508static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010509unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010510{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010511#define COMPARE(TYPE1, TYPE2) \
10512 do { \
10513 TYPE1* p1 = (TYPE1 *)data1; \
10514 TYPE2* p2 = (TYPE2 *)data2; \
10515 TYPE1* end = p1 + len; \
10516 Py_UCS4 c1, c2; \
10517 for (; p1 != end; p1++, p2++) { \
10518 c1 = *p1; \
10519 c2 = *p2; \
10520 if (c1 != c2) \
10521 return (c1 < c2) ? -1 : 1; \
10522 } \
10523 } \
10524 while (0)
10525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 int kind1, kind2;
10527 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010528 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 kind1 = PyUnicode_KIND(str1);
10531 kind2 = PyUnicode_KIND(str2);
10532 data1 = PyUnicode_DATA(str1);
10533 data2 = PyUnicode_DATA(str2);
10534 len1 = PyUnicode_GET_LENGTH(str1);
10535 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010536 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010537
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010538 switch(kind1) {
10539 case PyUnicode_1BYTE_KIND:
10540 {
10541 switch(kind2) {
10542 case PyUnicode_1BYTE_KIND:
10543 {
10544 int cmp = memcmp(data1, data2, len);
10545 /* normalize result of memcmp() into the range [-1; 1] */
10546 if (cmp < 0)
10547 return -1;
10548 if (cmp > 0)
10549 return 1;
10550 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010551 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010552 case PyUnicode_2BYTE_KIND:
10553 COMPARE(Py_UCS1, Py_UCS2);
10554 break;
10555 case PyUnicode_4BYTE_KIND:
10556 COMPARE(Py_UCS1, Py_UCS4);
10557 break;
10558 default:
10559 assert(0);
10560 }
10561 break;
10562 }
10563 case PyUnicode_2BYTE_KIND:
10564 {
10565 switch(kind2) {
10566 case PyUnicode_1BYTE_KIND:
10567 COMPARE(Py_UCS2, Py_UCS1);
10568 break;
10569 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010570 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010571 COMPARE(Py_UCS2, Py_UCS2);
10572 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010573 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010574 case PyUnicode_4BYTE_KIND:
10575 COMPARE(Py_UCS2, Py_UCS4);
10576 break;
10577 default:
10578 assert(0);
10579 }
10580 break;
10581 }
10582 case PyUnicode_4BYTE_KIND:
10583 {
10584 switch(kind2) {
10585 case PyUnicode_1BYTE_KIND:
10586 COMPARE(Py_UCS4, Py_UCS1);
10587 break;
10588 case PyUnicode_2BYTE_KIND:
10589 COMPARE(Py_UCS4, Py_UCS2);
10590 break;
10591 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010592 {
10593#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10594 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10595 /* normalize result of wmemcmp() into the range [-1; 1] */
10596 if (cmp < 0)
10597 return -1;
10598 if (cmp > 0)
10599 return 1;
10600#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010601 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010602#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010603 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010604 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010605 default:
10606 assert(0);
10607 }
10608 break;
10609 }
10610 default:
10611 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010612 }
10613
Victor Stinner770e19e2012-10-04 22:59:45 +020010614 if (len1 == len2)
10615 return 0;
10616 if (len1 < len2)
10617 return -1;
10618 else
10619 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010620
10621#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010622}
10623
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010624Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010625unicode_compare_eq(PyObject *str1, PyObject *str2)
10626{
10627 int kind;
10628 void *data1, *data2;
10629 Py_ssize_t len;
10630 int cmp;
10631
Victor Stinnere5567ad2012-10-23 02:48:49 +020010632 len = PyUnicode_GET_LENGTH(str1);
10633 if (PyUnicode_GET_LENGTH(str2) != len)
10634 return 0;
10635 kind = PyUnicode_KIND(str1);
10636 if (PyUnicode_KIND(str2) != kind)
10637 return 0;
10638 data1 = PyUnicode_DATA(str1);
10639 data2 = PyUnicode_DATA(str2);
10640
10641 cmp = memcmp(data1, data2, len * kind);
10642 return (cmp == 0);
10643}
10644
10645
Alexander Belopolsky40018472011-02-26 01:02:56 +000010646int
10647PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10650 if (PyUnicode_READY(left) == -1 ||
10651 PyUnicode_READY(right) == -1)
10652 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010653
10654 /* a string is equal to itself */
10655 if (left == right)
10656 return 0;
10657
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010658 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010660 PyErr_Format(PyExc_TypeError,
10661 "Can't compare %.100s and %.100s",
10662 left->ob_type->tp_name,
10663 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664 return -1;
10665}
10666
Martin v. Löwis5b222132007-06-10 09:51:05 +000010667int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010668_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10669{
10670 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10671 if (right_str == NULL)
10672 return -1;
10673 return PyUnicode_Compare(left, right_str);
10674}
10675
10676int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010677PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 Py_ssize_t i;
10680 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 Py_UCS4 chr;
10682
Victor Stinner910337b2011-10-03 03:20:16 +020010683 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (PyUnicode_READY(uni) == -1)
10685 return -1;
10686 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010687 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010688 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010689 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010690 size_t len, len2 = strlen(str);
10691 int cmp;
10692
10693 len = Py_MIN(len1, len2);
10694 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010695 if (cmp != 0) {
10696 if (cmp < 0)
10697 return -1;
10698 else
10699 return 1;
10700 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010701 if (len1 > len2)
10702 return 1; /* uni is longer */
10703 if (len2 > len1)
10704 return -1; /* str is longer */
10705 return 0;
10706 }
10707 else {
10708 void *data = PyUnicode_DATA(uni);
10709 /* Compare Unicode string and source character set string */
10710 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10711 if (chr != str[i])
10712 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10713 /* This check keeps Python strings that end in '\0' from comparing equal
10714 to C strings identical up to that point. */
10715 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10716 return 1; /* uni is longer */
10717 if (str[i])
10718 return -1; /* str is longer */
10719 return 0;
10720 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010721}
10722
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010723
Benjamin Peterson29060642009-01-31 22:14:21 +000010724#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010725 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010726
Alexander Belopolsky40018472011-02-26 01:02:56 +000010727PyObject *
10728PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010729{
10730 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010731 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010732
Victor Stinnere5567ad2012-10-23 02:48:49 +020010733 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10734 Py_RETURN_NOTIMPLEMENTED;
10735
10736 if (PyUnicode_READY(left) == -1 ||
10737 PyUnicode_READY(right) == -1)
10738 return NULL;
10739
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010740 if (left == right) {
10741 switch (op) {
10742 case Py_EQ:
10743 case Py_LE:
10744 case Py_GE:
10745 /* a string is equal to itself */
10746 v = Py_True;
10747 break;
10748 case Py_NE:
10749 case Py_LT:
10750 case Py_GT:
10751 v = Py_False;
10752 break;
10753 default:
10754 PyErr_BadArgument();
10755 return NULL;
10756 }
10757 }
10758 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010759 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010760 result ^= (op == Py_NE);
10761 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010762 }
10763 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010764 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010765
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010766 /* Convert the return value to a Boolean */
10767 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010768 case Py_LE:
10769 v = TEST_COND(result <= 0);
10770 break;
10771 case Py_GE:
10772 v = TEST_COND(result >= 0);
10773 break;
10774 case Py_LT:
10775 v = TEST_COND(result == -1);
10776 break;
10777 case Py_GT:
10778 v = TEST_COND(result == 1);
10779 break;
10780 default:
10781 PyErr_BadArgument();
10782 return NULL;
10783 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010784 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010785 Py_INCREF(v);
10786 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010787}
10788
Alexander Belopolsky40018472011-02-26 01:02:56 +000010789int
10790PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010791{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010792 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010793 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 void *buf1, *buf2;
10795 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010796 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010797
10798 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010799 sub = PyUnicode_FromObject(element);
10800 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010801 PyErr_Format(PyExc_TypeError,
10802 "'in <string>' requires string as left operand, not %s",
10803 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010804 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010805 }
10806
Thomas Wouters477c8d52006-05-27 19:21:47 +000010807 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010808 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010809 Py_DECREF(sub);
10810 return -1;
10811 }
10812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 kind1 = PyUnicode_KIND(str);
10814 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 buf1 = PyUnicode_DATA(str);
10816 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010817 if (kind2 != kind1) {
10818 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010819 Py_DECREF(sub);
10820 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010821 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010822 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010823 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 if (!buf2) {
10826 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010827 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 return -1;
10829 }
10830 len1 = PyUnicode_GET_LENGTH(str);
10831 len2 = PyUnicode_GET_LENGTH(sub);
10832
Victor Stinner77282cb2013-04-14 19:22:47 +020010833 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 case PyUnicode_1BYTE_KIND:
10835 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10836 break;
10837 case PyUnicode_2BYTE_KIND:
10838 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10839 break;
10840 case PyUnicode_4BYTE_KIND:
10841 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10842 break;
10843 default:
10844 result = -1;
10845 assert(0);
10846 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010847
10848 Py_DECREF(str);
10849 Py_DECREF(sub);
10850
Victor Stinner77282cb2013-04-14 19:22:47 +020010851 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 PyMem_Free(buf2);
10853
Guido van Rossum403d68b2000-03-13 15:55:09 +000010854 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010855}
10856
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857/* Concat to string or Unicode object giving a new Unicode object. */
10858
Alexander Belopolsky40018472011-02-26 01:02:56 +000010859PyObject *
10860PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010863 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010864 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865
10866 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010869 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010872 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873
10874 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010875 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010879 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010880 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882 }
10883
Victor Stinner488fa492011-12-12 00:01:39 +010010884 u_len = PyUnicode_GET_LENGTH(u);
10885 v_len = PyUnicode_GET_LENGTH(v);
10886 if (u_len > PY_SSIZE_T_MAX - v_len) {
10887 PyErr_SetString(PyExc_OverflowError,
10888 "strings are too large to concat");
10889 goto onError;
10890 }
10891 new_len = u_len + v_len;
10892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010894 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010895 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010898 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010901 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10902 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 Py_DECREF(u);
10904 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010905 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909 Py_XDECREF(u);
10910 Py_XDECREF(v);
10911 return NULL;
10912}
10913
Walter Dörwald1ab83302007-05-18 17:15:44 +000010914void
Victor Stinner23e56682011-10-03 03:54:37 +020010915PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010916{
Victor Stinner23e56682011-10-03 03:54:37 +020010917 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010918 Py_UCS4 maxchar, maxchar2;
10919 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010920
10921 if (p_left == NULL) {
10922 if (!PyErr_Occurred())
10923 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010924 return;
10925 }
Victor Stinner23e56682011-10-03 03:54:37 +020010926 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020010927 if (right == NULL || left == NULL
10928 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010929 if (!PyErr_Occurred())
10930 PyErr_BadInternalCall();
10931 goto error;
10932 }
10933
Benjamin Petersonbac79492012-01-14 13:34:47 -050010934 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010935 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010936 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010937 goto error;
10938
Victor Stinner488fa492011-12-12 00:01:39 +010010939 /* Shortcuts */
10940 if (left == unicode_empty) {
10941 Py_DECREF(left);
10942 Py_INCREF(right);
10943 *p_left = right;
10944 return;
10945 }
10946 if (right == unicode_empty)
10947 return;
10948
10949 left_len = PyUnicode_GET_LENGTH(left);
10950 right_len = PyUnicode_GET_LENGTH(right);
10951 if (left_len > PY_SSIZE_T_MAX - right_len) {
10952 PyErr_SetString(PyExc_OverflowError,
10953 "strings are too large to concat");
10954 goto error;
10955 }
10956 new_len = left_len + right_len;
10957
10958 if (unicode_modifiable(left)
10959 && PyUnicode_CheckExact(right)
10960 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010961 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10962 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010963 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010964 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010965 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10966 {
10967 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010968 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010010969 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020010970
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010971 /* copy 'right' into the newly allocated area of 'left' */
10972 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010973 }
Victor Stinner488fa492011-12-12 00:01:39 +010010974 else {
10975 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10976 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010977 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010978
Victor Stinner488fa492011-12-12 00:01:39 +010010979 /* Concat the two Unicode strings */
10980 res = PyUnicode_New(new_len, maxchar);
10981 if (res == NULL)
10982 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010983 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10984 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010985 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020010986 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010010987 }
10988 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010989 return;
10990
10991error:
Victor Stinner488fa492011-12-12 00:01:39 +010010992 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010993}
10994
10995void
10996PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10997{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010998 PyUnicode_Append(pleft, right);
10999 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011000}
11001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011002PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011005Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011006string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011007interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008
11009static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011010unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011012 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011013 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011014 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 int kind1, kind2, kind;
11017 void *buf1, *buf2;
11018 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019
Jesus Ceaac451502011-04-20 17:09:23 +020011020 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11021 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 kind1 = PyUnicode_KIND(self);
11025 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011026 if (kind2 > kind1) {
11027 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011028 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011029 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011030 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 buf1 = PyUnicode_DATA(self);
11032 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011034 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 if (!buf2) {
11036 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 return NULL;
11038 }
11039 len1 = PyUnicode_GET_LENGTH(self);
11040 len2 = PyUnicode_GET_LENGTH(substring);
11041
11042 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011043 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 case PyUnicode_1BYTE_KIND:
11045 iresult = ucs1lib_count(
11046 ((Py_UCS1*)buf1) + start, end - start,
11047 buf2, len2, PY_SSIZE_T_MAX
11048 );
11049 break;
11050 case PyUnicode_2BYTE_KIND:
11051 iresult = ucs2lib_count(
11052 ((Py_UCS2*)buf1) + start, end - start,
11053 buf2, len2, PY_SSIZE_T_MAX
11054 );
11055 break;
11056 case PyUnicode_4BYTE_KIND:
11057 iresult = ucs4lib_count(
11058 ((Py_UCS4*)buf1) + start, end - start,
11059 buf2, len2, PY_SSIZE_T_MAX
11060 );
11061 break;
11062 default:
11063 assert(0); iresult = 0;
11064 }
11065
11066 result = PyLong_FromSsize_t(iresult);
11067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 if (kind2 != kind)
11069 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070
11071 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011072
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073 return result;
11074}
11075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011076PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011077 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011079Encode S using the codec registered for encoding. Default encoding\n\
11080is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011081handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011082a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11083'xmlcharrefreplace' as well as any other name registered with\n\
11084codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085
11086static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011087unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011089 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090 char *encoding = NULL;
11091 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011092
Benjamin Peterson308d6372009-09-18 21:42:35 +000011093 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11094 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011096 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011097}
11098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011099PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011100 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101\n\
11102Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011103If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104
11105static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011106unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011108 Py_ssize_t i, j, line_pos, src_len, incr;
11109 Py_UCS4 ch;
11110 PyObject *u;
11111 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011112 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011114 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011115 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116
Ezio Melotti745d54d2013-11-16 19:10:57 +020011117 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11118 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120
Antoine Pitrou22425222011-10-04 19:10:51 +020011121 if (PyUnicode_READY(self) == -1)
11122 return NULL;
11123
Thomas Wouters7e474022000-07-16 12:04:32 +000011124 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011125 src_len = PyUnicode_GET_LENGTH(self);
11126 i = j = line_pos = 0;
11127 kind = PyUnicode_KIND(self);
11128 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011129 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011130 for (; i < src_len; i++) {
11131 ch = PyUnicode_READ(kind, src_data, i);
11132 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011133 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011134 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011135 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011136 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011137 goto overflow;
11138 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011140 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011143 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011144 goto overflow;
11145 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011147 if (ch == '\n' || ch == '\r')
11148 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011150 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011151 if (!found)
11152 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011153
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011155 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156 if (!u)
11157 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011158 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159
Antoine Pitroue71d5742011-10-04 15:55:09 +020011160 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
Antoine Pitroue71d5742011-10-04 15:55:09 +020011162 for (; i < src_len; i++) {
11163 ch = PyUnicode_READ(kind, src_data, i);
11164 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011165 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011166 incr = tabsize - (line_pos % tabsize);
11167 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011168 FILL(kind, dest_data, ' ', j, incr);
11169 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011171 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011172 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011173 line_pos++;
11174 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011175 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011176 if (ch == '\n' || ch == '\r')
11177 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011179 }
11180 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011181 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011182
Antoine Pitroue71d5742011-10-04 15:55:09 +020011183 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011184 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186}
11187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011188PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011189 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190\n\
11191Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011192such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193arguments start and end are interpreted as in slice notation.\n\
11194\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011195Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196
11197static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011200 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011201 Py_ssize_t start;
11202 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011203 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204
Jesus Ceaac451502011-04-20 17:09:23 +020011205 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11206 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208
Christian Heimesd47802e2013-06-29 21:33:36 +020011209 if (PyUnicode_READY(self) == -1) {
11210 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011212 }
11213 if (PyUnicode_READY(substring) == -1) {
11214 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217
Victor Stinner7931d9a2011-11-04 00:22:48 +010011218 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219
11220 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 if (result == -2)
11223 return NULL;
11224
Christian Heimes217cfd12007-12-02 14:31:20 +000011225 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226}
11227
11228static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011229unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011231 void *data;
11232 enum PyUnicode_Kind kind;
11233 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011234
11235 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11236 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011238 }
11239 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11240 PyErr_SetString(PyExc_IndexError, "string index out of range");
11241 return NULL;
11242 }
11243 kind = PyUnicode_KIND(self);
11244 data = PyUnicode_DATA(self);
11245 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011246 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247}
11248
Guido van Rossumc2504932007-09-18 19:42:40 +000011249/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011250 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011251static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011252unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253{
Guido van Rossumc2504932007-09-18 19:42:40 +000011254 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011255 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011256
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011257#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011258 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011259#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 if (_PyUnicode_HASH(self) != -1)
11261 return _PyUnicode_HASH(self);
11262 if (PyUnicode_READY(self) == -1)
11263 return -1;
11264 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011265 /*
11266 We make the hash of the empty string be 0, rather than using
11267 (prefix ^ suffix), since this slightly obfuscates the hash secret
11268 */
11269 if (len == 0) {
11270 _PyUnicode_HASH(self) = 0;
11271 return 0;
11272 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011273 x = _Py_HashBytes(PyUnicode_DATA(self),
11274 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011276 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277}
11278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011279PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
11284static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011287 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011288 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011289 Py_ssize_t start;
11290 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
Jesus Ceaac451502011-04-20 17:09:23 +020011292 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11293 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
Christian Heimesd47a0452013-06-29 21:21:37 +020011296 if (PyUnicode_READY(self) == -1) {
11297 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011299 }
11300 if (PyUnicode_READY(substring) == -1) {
11301 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304
Victor Stinner7931d9a2011-11-04 00:22:48 +010011305 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
11307 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 if (result == -2)
11310 return NULL;
11311
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312 if (result < 0) {
11313 PyErr_SetString(PyExc_ValueError, "substring not found");
11314 return NULL;
11315 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011316
Christian Heimes217cfd12007-12-02 14:31:20 +000011317 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318}
11319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011320PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011323Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011324at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
11326static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011327unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 Py_ssize_t i, length;
11330 int kind;
11331 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332 int cased;
11333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 if (PyUnicode_READY(self) == -1)
11335 return NULL;
11336 length = PyUnicode_GET_LENGTH(self);
11337 kind = PyUnicode_KIND(self);
11338 data = PyUnicode_DATA(self);
11339
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 if (length == 1)
11342 return PyBool_FromLong(
11343 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011345 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011348
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 for (i = 0; i < length; i++) {
11351 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011352
Benjamin Peterson29060642009-01-31 22:14:21 +000011353 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11354 return PyBool_FromLong(0);
11355 else if (!cased && Py_UNICODE_ISLOWER(ch))
11356 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011358 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359}
11360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011361PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011364Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011365at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366
11367static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011368unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 Py_ssize_t i, length;
11371 int kind;
11372 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 int cased;
11374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 if (PyUnicode_READY(self) == -1)
11376 return NULL;
11377 length = PyUnicode_GET_LENGTH(self);
11378 kind = PyUnicode_KIND(self);
11379 data = PyUnicode_DATA(self);
11380
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 if (length == 1)
11383 return PyBool_FromLong(
11384 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011386 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011389
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 for (i = 0; i < length; i++) {
11392 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011393
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11395 return PyBool_FromLong(0);
11396 else if (!cased && Py_UNICODE_ISUPPER(ch))
11397 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011399 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400}
11401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011402PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011405Return True if S is a titlecased string and there is at least one\n\
11406character in S, i.e. upper- and titlecase characters may only\n\
11407follow uncased characters and lowercase characters only cased ones.\n\
11408Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409
11410static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011411unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 Py_ssize_t i, length;
11414 int kind;
11415 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 int cased, previous_is_cased;
11417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 if (PyUnicode_READY(self) == -1)
11419 return NULL;
11420 length = PyUnicode_GET_LENGTH(self);
11421 kind = PyUnicode_KIND(self);
11422 data = PyUnicode_DATA(self);
11423
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 if (length == 1) {
11426 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11427 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11428 (Py_UNICODE_ISUPPER(ch) != 0));
11429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011431 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011434
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 cased = 0;
11436 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 for (i = 0; i < length; i++) {
11438 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011439
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11441 if (previous_is_cased)
11442 return PyBool_FromLong(0);
11443 previous_is_cased = 1;
11444 cased = 1;
11445 }
11446 else if (Py_UNICODE_ISLOWER(ch)) {
11447 if (!previous_is_cased)
11448 return PyBool_FromLong(0);
11449 previous_is_cased = 1;
11450 cased = 1;
11451 }
11452 else
11453 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011455 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456}
11457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011458PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011461Return True if all characters in S are whitespace\n\
11462and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
11464static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011465unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 Py_ssize_t i, length;
11468 int kind;
11469 void *data;
11470
11471 if (PyUnicode_READY(self) == -1)
11472 return NULL;
11473 length = PyUnicode_GET_LENGTH(self);
11474 kind = PyUnicode_KIND(self);
11475 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 if (length == 1)
11479 return PyBool_FromLong(
11480 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011482 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 for (i = 0; i < length; i++) {
11487 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011488 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011491 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492}
11493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011494PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011496\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011497Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011498and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011499
11500static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011501unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011502{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 Py_ssize_t i, length;
11504 int kind;
11505 void *data;
11506
11507 if (PyUnicode_READY(self) == -1)
11508 return NULL;
11509 length = PyUnicode_GET_LENGTH(self);
11510 kind = PyUnicode_KIND(self);
11511 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011512
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011513 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 if (length == 1)
11515 return PyBool_FromLong(
11516 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011517
11518 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 for (i = 0; i < length; i++) {
11523 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011525 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011526 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011527}
11528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011529PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011531\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011532Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011533and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011534
11535static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011536unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 int kind;
11539 void *data;
11540 Py_ssize_t len, i;
11541
11542 if (PyUnicode_READY(self) == -1)
11543 return NULL;
11544
11545 kind = PyUnicode_KIND(self);
11546 data = PyUnicode_DATA(self);
11547 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011548
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011549 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 if (len == 1) {
11551 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11552 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11553 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011554
11555 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 for (i = 0; i < len; i++) {
11560 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011561 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011563 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011564 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011565}
11566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011567PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011570Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011571False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572
11573static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011574unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 Py_ssize_t i, length;
11577 int kind;
11578 void *data;
11579
11580 if (PyUnicode_READY(self) == -1)
11581 return NULL;
11582 length = PyUnicode_GET_LENGTH(self);
11583 kind = PyUnicode_KIND(self);
11584 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 if (length == 1)
11588 return PyBool_FromLong(
11589 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011591 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 for (i = 0; i < length; i++) {
11596 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011599 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600}
11601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011602PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011605Return True if all characters in S are digits\n\
11606and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607
11608static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011609unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 Py_ssize_t i, length;
11612 int kind;
11613 void *data;
11614
11615 if (PyUnicode_READY(self) == -1)
11616 return NULL;
11617 length = PyUnicode_GET_LENGTH(self);
11618 kind = PyUnicode_KIND(self);
11619 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 if (length == 1) {
11623 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11624 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011627 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 for (i = 0; i < length; i++) {
11632 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011635 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636}
11637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011638PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011641Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011642False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
11644static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011645unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 Py_ssize_t i, length;
11648 int kind;
11649 void *data;
11650
11651 if (PyUnicode_READY(self) == -1)
11652 return NULL;
11653 length = PyUnicode_GET_LENGTH(self);
11654 kind = PyUnicode_KIND(self);
11655 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 if (length == 1)
11659 return PyBool_FromLong(
11660 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011662 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011664 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 for (i = 0; i < length; i++) {
11667 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011670 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671}
11672
Martin v. Löwis47383402007-08-15 07:32:56 +000011673int
11674PyUnicode_IsIdentifier(PyObject *self)
11675{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 int kind;
11677 void *data;
11678 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011679 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 if (PyUnicode_READY(self) == -1) {
11682 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 }
11685
11686 /* Special case for empty strings */
11687 if (PyUnicode_GET_LENGTH(self) == 0)
11688 return 0;
11689 kind = PyUnicode_KIND(self);
11690 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011691
11692 /* PEP 3131 says that the first character must be in
11693 XID_Start and subsequent characters in XID_Continue,
11694 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011695 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011696 letters, digits, underscore). However, given the current
11697 definition of XID_Start and XID_Continue, it is sufficient
11698 to check just for these, except that _ must be allowed
11699 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011701 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011702 return 0;
11703
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011704 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011707 return 1;
11708}
11709
11710PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011712\n\
11713Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011714to the language definition.\n\
11715\n\
11716Use keyword.iskeyword() to test for reserved identifiers\n\
11717such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011718
11719static PyObject*
11720unicode_isidentifier(PyObject *self)
11721{
11722 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11723}
11724
Georg Brandl559e5d72008-06-11 18:37:52 +000011725PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011727\n\
11728Return True if all characters in S are considered\n\
11729printable in repr() or S is empty, False otherwise.");
11730
11731static PyObject*
11732unicode_isprintable(PyObject *self)
11733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 Py_ssize_t i, length;
11735 int kind;
11736 void *data;
11737
11738 if (PyUnicode_READY(self) == -1)
11739 return NULL;
11740 length = PyUnicode_GET_LENGTH(self);
11741 kind = PyUnicode_KIND(self);
11742 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011743
11744 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (length == 1)
11746 return PyBool_FromLong(
11747 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 for (i = 0; i < length; i++) {
11750 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011751 Py_RETURN_FALSE;
11752 }
11753 }
11754 Py_RETURN_TRUE;
11755}
11756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011758 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759\n\
11760Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011761iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
11763static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011764unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011766 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767}
11768
Martin v. Löwis18e16552006-02-15 17:27:45 +000011769static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011770unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 if (PyUnicode_READY(self) == -1)
11773 return -1;
11774 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775}
11776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011777PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011780Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011781done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782
11783static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011784unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011786 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 Py_UCS4 fillchar = ' ';
11788
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011789 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 return NULL;
11791
Benjamin Petersonbac79492012-01-14 13:34:47 -050011792 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
Victor Stinnerc4b49542011-12-11 22:44:26 +010011795 if (PyUnicode_GET_LENGTH(self) >= width)
11796 return unicode_result_unchanged(self);
11797
11798 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799}
11800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011804Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
11806static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011807unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011809 if (PyUnicode_READY(self) == -1)
11810 return NULL;
11811 if (PyUnicode_IS_ASCII(self))
11812 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011813 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814}
11815
/* Selectors for the three strip flavours.  Their values double as
   indexes into stripformat[] below, so keep the two in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip selector i: the matching format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11824
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825/* externally visible for str.strip(unicode) */
11826PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011827_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 void *data;
11830 int kind;
11831 Py_ssize_t i, j, len;
11832 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011833 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11836 return NULL;
11837
11838 kind = PyUnicode_KIND(self);
11839 data = PyUnicode_DATA(self);
11840 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011841 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11843 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011844 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011845
Benjamin Peterson14339b62009-01-31 16:36:08 +000011846 i = 0;
11847 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011848 while (i < len) {
11849 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11850 if (!BLOOM(sepmask, ch))
11851 break;
11852 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11853 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 i++;
11855 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011856 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011857
Benjamin Peterson14339b62009-01-31 16:36:08 +000011858 j = len;
11859 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011860 j--;
11861 while (j >= i) {
11862 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11863 if (!BLOOM(sepmask, ch))
11864 break;
11865 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11866 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011868 }
11869
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011871 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011872
Victor Stinner7931d9a2011-11-04 00:22:48 +010011873 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874}
11875
11876PyObject*
11877PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11878{
11879 unsigned char *data;
11880 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011881 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882
Victor Stinnerde636f32011-10-01 03:55:54 +020011883 if (PyUnicode_READY(self) == -1)
11884 return NULL;
11885
Victor Stinner684d5fd2012-05-03 02:32:34 +020011886 length = PyUnicode_GET_LENGTH(self);
11887 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011888
Victor Stinner684d5fd2012-05-03 02:32:34 +020011889 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011890 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891
Victor Stinnerde636f32011-10-01 03:55:54 +020011892 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011893 PyErr_SetString(PyExc_IndexError, "string index out of range");
11894 return NULL;
11895 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011896 if (start >= length || end < start)
11897 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011898
Victor Stinner684d5fd2012-05-03 02:32:34 +020011899 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011900 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011901 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011902 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011903 }
11904 else {
11905 kind = PyUnicode_KIND(self);
11906 data = PyUnicode_1BYTE_DATA(self);
11907 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011908 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011909 length);
11910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
11913static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011914do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 Py_ssize_t len, i, j;
11917
11918 if (PyUnicode_READY(self) == -1)
11919 return NULL;
11920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011922
Victor Stinnercc7af722013-04-09 22:39:24 +020011923 if (PyUnicode_IS_ASCII(self)) {
11924 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11925
11926 i = 0;
11927 if (striptype != RIGHTSTRIP) {
11928 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011929 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020011930 if (!_Py_ascii_whitespace[ch])
11931 break;
11932 i++;
11933 }
11934 }
11935
11936 j = len;
11937 if (striptype != LEFTSTRIP) {
11938 j--;
11939 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020011940 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020011941 if (!_Py_ascii_whitespace[ch])
11942 break;
11943 j--;
11944 }
11945 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011946 }
11947 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011948 else {
11949 int kind = PyUnicode_KIND(self);
11950 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011951
Victor Stinnercc7af722013-04-09 22:39:24 +020011952 i = 0;
11953 if (striptype != RIGHTSTRIP) {
11954 while (i < len) {
11955 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11956 if (!Py_UNICODE_ISSPACE(ch))
11957 break;
11958 i++;
11959 }
Victor Stinner9c79e412013-04-09 22:21:08 +020011960 }
Victor Stinnercc7af722013-04-09 22:39:24 +020011961
11962 j = len;
11963 if (striptype != LEFTSTRIP) {
11964 j--;
11965 while (j >= i) {
11966 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11967 if (!Py_UNICODE_ISSPACE(ch))
11968 break;
11969 j--;
11970 }
11971 j++;
11972 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011973 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011974
Victor Stinner7931d9a2011-11-04 00:22:48 +010011975 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976}
11977
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011978
11979static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011980do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011981{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011982 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011983
Serhiy Storchakac6792272013-10-19 21:03:34 +030011984 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011985 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011986
Benjamin Peterson14339b62009-01-31 16:36:08 +000011987 if (sep != NULL && sep != Py_None) {
11988 if (PyUnicode_Check(sep))
11989 return _PyUnicode_XStrip(self, striptype, sep);
11990 else {
11991 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 "%s arg must be None or str",
11993 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011994 return NULL;
11995 }
11996 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011997
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011999}
12000
12001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012002PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012004\n\
12005Return a copy of the string S with leading and trailing\n\
12006whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012007If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012008
12009static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012010unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012011{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012012 if (PyTuple_GET_SIZE(args) == 0)
12013 return do_strip(self, BOTHSTRIP); /* Common case */
12014 else
12015 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012016}
12017
12018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012019PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012021\n\
12022Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012023If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012024
12025static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012026unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012027{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012028 if (PyTuple_GET_SIZE(args) == 0)
12029 return do_strip(self, LEFTSTRIP); /* Common case */
12030 else
12031 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012032}
12033
12034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012035PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012036 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012037\n\
12038Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012039If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012040
12041static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012042unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012043{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012044 if (PyTuple_GET_SIZE(args) == 0)
12045 return do_strip(self, RIGHTSTRIP); /* Common case */
12046 else
12047 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012048}
12049
12050
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012052unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012054 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056
Serhiy Storchaka05997252013-01-26 12:14:02 +020012057 if (len < 1)
12058 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059
Victor Stinnerc4b49542011-12-11 22:44:26 +010012060 /* no repeat, return original string */
12061 if (len == 1)
12062 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012063
Benjamin Petersonbac79492012-01-14 13:34:47 -050012064 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 return NULL;
12066
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012067 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012068 PyErr_SetString(PyExc_OverflowError,
12069 "repeated string is too long");
12070 return NULL;
12071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012073
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012074 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 if (!u)
12076 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012077 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 if (PyUnicode_GET_LENGTH(str) == 1) {
12080 const int kind = PyUnicode_KIND(str);
12081 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012082 if (kind == PyUnicode_1BYTE_KIND) {
12083 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012084 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012085 }
12086 else if (kind == PyUnicode_2BYTE_KIND) {
12087 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012088 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012089 ucs2[n] = fill_char;
12090 } else {
12091 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12092 assert(kind == PyUnicode_4BYTE_KIND);
12093 for (n = 0; n < len; ++n)
12094 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 }
12097 else {
12098 /* number of characters copied this far */
12099 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012100 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 char *to = (char *) PyUnicode_DATA(u);
12102 Py_MEMCPY(to, PyUnicode_DATA(str),
12103 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 n = (done <= nchars-done) ? done : nchars-done;
12106 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012107 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109 }
12110
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012111 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012112 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113}
12114
Alexander Belopolsky40018472011-02-26 01:02:56 +000012115PyObject *
12116PyUnicode_Replace(PyObject *obj,
12117 PyObject *subobj,
12118 PyObject *replobj,
12119 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120{
12121 PyObject *self;
12122 PyObject *str1;
12123 PyObject *str2;
12124 PyObject *result;
12125
12126 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012127 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012130 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 Py_DECREF(self);
12132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133 }
12134 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012135 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 Py_DECREF(self);
12137 Py_DECREF(str1);
12138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012140 if (PyUnicode_READY(self) == -1 ||
12141 PyUnicode_READY(str1) == -1 ||
12142 PyUnicode_READY(str2) == -1)
12143 result = NULL;
12144 else
12145 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146 Py_DECREF(self);
12147 Py_DECREF(str1);
12148 Py_DECREF(str2);
12149 return result;
12150}
12151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012152PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012153 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154\n\
12155Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012156old replaced by new. If the optional argument count is\n\
12157given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158
12159static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 PyObject *str1;
12163 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012164 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165 PyObject *result;
12166
Martin v. Löwis18e16552006-02-15 17:27:45 +000012167 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012169 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012172 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 return NULL;
12174 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012175 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 Py_DECREF(str1);
12177 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012178 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012179 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12180 result = NULL;
12181 else
12182 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183
12184 Py_DECREF(str1);
12185 Py_DECREF(str2);
12186 return result;
12187}
12188
Alexander Belopolsky40018472011-02-26 01:02:56 +000012189static PyObject *
12190unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012192 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 Py_ssize_t isize;
12194 Py_ssize_t osize, squote, dquote, i, o;
12195 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012196 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012200 return NULL;
12201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 isize = PyUnicode_GET_LENGTH(unicode);
12203 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 /* Compute length of output, quote characters, and
12206 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012207 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 max = 127;
12209 squote = dquote = 0;
12210 ikind = PyUnicode_KIND(unicode);
12211 for (i = 0; i < isize; i++) {
12212 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12213 switch (ch) {
12214 case '\'': squote++; osize++; break;
12215 case '"': dquote++; osize++; break;
12216 case '\\': case '\t': case '\r': case '\n':
12217 osize += 2; break;
12218 default:
12219 /* Fast-path ASCII */
12220 if (ch < ' ' || ch == 0x7f)
12221 osize += 4; /* \xHH */
12222 else if (ch < 0x7f)
12223 osize++;
12224 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12225 osize++;
12226 max = ch > max ? ch : max;
12227 }
12228 else if (ch < 0x100)
12229 osize += 4; /* \xHH */
12230 else if (ch < 0x10000)
12231 osize += 6; /* \uHHHH */
12232 else
12233 osize += 10; /* \uHHHHHHHH */
12234 }
12235 }
12236
12237 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012238 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012240 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 if (dquote)
12242 /* Both squote and dquote present. Use squote,
12243 and escape them */
12244 osize += squote;
12245 else
12246 quote = '"';
12247 }
Victor Stinner55c08782013-04-14 18:45:39 +020012248 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249
12250 repr = PyUnicode_New(osize, max);
12251 if (repr == NULL)
12252 return NULL;
12253 okind = PyUnicode_KIND(repr);
12254 odata = PyUnicode_DATA(repr);
12255
12256 PyUnicode_WRITE(okind, odata, 0, quote);
12257 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012258 if (unchanged) {
12259 _PyUnicode_FastCopyCharacters(repr, 1,
12260 unicode, 0,
12261 isize);
12262 }
12263 else {
12264 for (i = 0, o = 1; i < isize; i++) {
12265 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266
Victor Stinner55c08782013-04-14 18:45:39 +020012267 /* Escape quotes and backslashes */
12268 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012269 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012271 continue;
12272 }
12273
12274 /* Map special whitespace to '\t', \n', '\r' */
12275 if (ch == '\t') {
12276 PyUnicode_WRITE(okind, odata, o++, '\\');
12277 PyUnicode_WRITE(okind, odata, o++, 't');
12278 }
12279 else if (ch == '\n') {
12280 PyUnicode_WRITE(okind, odata, o++, '\\');
12281 PyUnicode_WRITE(okind, odata, o++, 'n');
12282 }
12283 else if (ch == '\r') {
12284 PyUnicode_WRITE(okind, odata, o++, '\\');
12285 PyUnicode_WRITE(okind, odata, o++, 'r');
12286 }
12287
12288 /* Map non-printable US ASCII to '\xhh' */
12289 else if (ch < ' ' || ch == 0x7F) {
12290 PyUnicode_WRITE(okind, odata, o++, '\\');
12291 PyUnicode_WRITE(okind, odata, o++, 'x');
12292 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12293 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12294 }
12295
12296 /* Copy ASCII characters as-is */
12297 else if (ch < 0x7F) {
12298 PyUnicode_WRITE(okind, odata, o++, ch);
12299 }
12300
12301 /* Non-ASCII characters */
12302 else {
12303 /* Map Unicode whitespace and control characters
12304 (categories Z* and C* except ASCII space)
12305 */
12306 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12307 PyUnicode_WRITE(okind, odata, o++, '\\');
12308 /* Map 8-bit characters to '\xhh' */
12309 if (ch <= 0xff) {
12310 PyUnicode_WRITE(okind, odata, o++, 'x');
12311 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12312 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12313 }
12314 /* Map 16-bit characters to '\uxxxx' */
12315 else if (ch <= 0xffff) {
12316 PyUnicode_WRITE(okind, odata, o++, 'u');
12317 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12318 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12319 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12320 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12321 }
12322 /* Map 21-bit characters to '\U00xxxxxx' */
12323 else {
12324 PyUnicode_WRITE(okind, odata, o++, 'U');
12325 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12326 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12327 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12328 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12329 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12330 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12331 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12332 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12333 }
12334 }
12335 /* Copy characters as-is */
12336 else {
12337 PyUnicode_WRITE(okind, odata, o++, ch);
12338 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012339 }
12340 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012343 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012344 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345}
12346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012347PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349\n\
12350Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012351such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352arguments start and end are interpreted as in slice notation.\n\
12353\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012354Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355
12356static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012359 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012360 Py_ssize_t start;
12361 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012362 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363
Jesus Ceaac451502011-04-20 17:09:23 +020012364 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12365 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367
Christian Heimesea71a522013-06-29 21:17:34 +020012368 if (PyUnicode_READY(self) == -1) {
12369 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012371 }
12372 if (PyUnicode_READY(substring) == -1) {
12373 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376
Victor Stinner7931d9a2011-11-04 00:22:48 +010012377 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378
12379 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 if (result == -2)
12382 return NULL;
12383
Christian Heimes217cfd12007-12-02 14:31:20 +000012384 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385}
12386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012387PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012390Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391
12392static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012395 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012396 Py_ssize_t start;
12397 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012398 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399
Jesus Ceaac451502011-04-20 17:09:23 +020012400 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12401 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403
Christian Heimesea71a522013-06-29 21:17:34 +020012404 if (PyUnicode_READY(self) == -1) {
12405 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012407 }
12408 if (PyUnicode_READY(substring) == -1) {
12409 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412
Victor Stinner7931d9a2011-11-04 00:22:48 +010012413 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414
12415 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 if (result == -2)
12418 return NULL;
12419
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420 if (result < 0) {
12421 PyErr_SetString(PyExc_ValueError, "substring not found");
12422 return NULL;
12423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424
Christian Heimes217cfd12007-12-02 14:31:20 +000012425 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426}
12427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012428PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012429 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012431Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012432done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012433
12434static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012435unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012437 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 Py_UCS4 fillchar = ' ';
12439
Victor Stinnere9a29352011-10-01 02:14:59 +020012440 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012442
Benjamin Petersonbac79492012-01-14 13:34:47 -050012443 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444 return NULL;
12445
Victor Stinnerc4b49542011-12-11 22:44:26 +010012446 if (PyUnicode_GET_LENGTH(self) >= width)
12447 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448
Victor Stinnerc4b49542011-12-11 22:44:26 +010012449 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450}
12451
Alexander Belopolsky40018472011-02-26 01:02:56 +000012452PyObject *
12453PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454{
12455 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012456
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457 s = PyUnicode_FromObject(s);
12458 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 if (sep != NULL) {
12461 sep = PyUnicode_FromObject(sep);
12462 if (sep == NULL) {
12463 Py_DECREF(s);
12464 return NULL;
12465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466 }
12467
Victor Stinner9310abb2011-10-05 00:59:23 +020012468 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469
12470 Py_DECREF(s);
12471 Py_XDECREF(sep);
12472 return result;
12473}
12474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012475PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012476 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477\n\
12478Return a list of the words in S, using sep as the\n\
12479delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012480splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012481whitespace string is a separator and empty strings are\n\
12482removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483
12484static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012485unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012487 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012489 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012491 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12492 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493 return NULL;
12494
12495 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012496 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012498 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012500 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501}
12502
Thomas Wouters477c8d52006-05-27 19:21:47 +000012503PyObject *
12504PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12505{
12506 PyObject* str_obj;
12507 PyObject* sep_obj;
12508 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 int kind1, kind2, kind;
12510 void *buf1 = NULL, *buf2 = NULL;
12511 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012512
12513 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012514 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012515 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012516 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012517 if (!sep_obj) {
12518 Py_DECREF(str_obj);
12519 return NULL;
12520 }
12521 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12522 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012523 Py_DECREF(str_obj);
12524 return NULL;
12525 }
12526
Victor Stinner14f8f022011-10-05 20:58:25 +020012527 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012529 kind = Py_MAX(kind1, kind2);
12530 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012532 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 if (!buf1)
12534 goto onError;
12535 buf2 = PyUnicode_DATA(sep_obj);
12536 if (kind2 != kind)
12537 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12538 if (!buf2)
12539 goto onError;
12540 len1 = PyUnicode_GET_LENGTH(str_obj);
12541 len2 = PyUnicode_GET_LENGTH(sep_obj);
12542
Benjamin Petersonead6b532011-12-20 17:23:42 -060012543 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012545 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12546 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12547 else
12548 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 break;
12550 case PyUnicode_2BYTE_KIND:
12551 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12552 break;
12553 case PyUnicode_4BYTE_KIND:
12554 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12555 break;
12556 default:
12557 assert(0);
12558 out = 0;
12559 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012560
12561 Py_DECREF(sep_obj);
12562 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 if (kind1 != kind)
12564 PyMem_Free(buf1);
12565 if (kind2 != kind)
12566 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012567
12568 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 onError:
12570 Py_DECREF(sep_obj);
12571 Py_DECREF(str_obj);
12572 if (kind1 != kind && buf1)
12573 PyMem_Free(buf1);
12574 if (kind2 != kind && buf2)
12575 PyMem_Free(buf2);
12576 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012577}
12578
12579
12580PyObject *
12581PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12582{
12583 PyObject* str_obj;
12584 PyObject* sep_obj;
12585 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 int kind1, kind2, kind;
12587 void *buf1 = NULL, *buf2 = NULL;
12588 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012589
12590 str_obj = PyUnicode_FromObject(str_in);
12591 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012593 sep_obj = PyUnicode_FromObject(sep_in);
12594 if (!sep_obj) {
12595 Py_DECREF(str_obj);
12596 return NULL;
12597 }
12598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 kind1 = PyUnicode_KIND(str_in);
12600 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012601 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 buf1 = PyUnicode_DATA(str_in);
12603 if (kind1 != kind)
12604 buf1 = _PyUnicode_AsKind(str_in, kind);
12605 if (!buf1)
12606 goto onError;
12607 buf2 = PyUnicode_DATA(sep_obj);
12608 if (kind2 != kind)
12609 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12610 if (!buf2)
12611 goto onError;
12612 len1 = PyUnicode_GET_LENGTH(str_obj);
12613 len2 = PyUnicode_GET_LENGTH(sep_obj);
12614
Benjamin Petersonead6b532011-12-20 17:23:42 -060012615 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012617 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12618 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12619 else
12620 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 break;
12622 case PyUnicode_2BYTE_KIND:
12623 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12624 break;
12625 case PyUnicode_4BYTE_KIND:
12626 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12627 break;
12628 default:
12629 assert(0);
12630 out = 0;
12631 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012632
12633 Py_DECREF(sep_obj);
12634 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 if (kind1 != kind)
12636 PyMem_Free(buf1);
12637 if (kind2 != kind)
12638 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012639
12640 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 onError:
12642 Py_DECREF(sep_obj);
12643 Py_DECREF(str_obj);
12644 if (kind1 != kind && buf1)
12645 PyMem_Free(buf1);
12646 if (kind2 != kind && buf2)
12647 PyMem_Free(buf2);
12648 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012649}
12650
12651PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012653\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012654Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012655the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012656found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012657
12658static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012659unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012660{
Victor Stinner9310abb2011-10-05 00:59:23 +020012661 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012662}
12663
12664PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012665 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012666\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012667Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012668the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012669separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012670
12671static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012672unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012673{
Victor Stinner9310abb2011-10-05 00:59:23 +020012674 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012675}
12676
Alexander Belopolsky40018472011-02-26 01:02:56 +000012677PyObject *
12678PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012679{
12680 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012681
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012682 s = PyUnicode_FromObject(s);
12683 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012684 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 if (sep != NULL) {
12686 sep = PyUnicode_FromObject(sep);
12687 if (sep == NULL) {
12688 Py_DECREF(s);
12689 return NULL;
12690 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012691 }
12692
Victor Stinner9310abb2011-10-05 00:59:23 +020012693 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012694
12695 Py_DECREF(s);
12696 Py_XDECREF(sep);
12697 return result;
12698}
12699
12700PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012701 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012702\n\
12703Return a list of the words in S, using sep as the\n\
12704delimiter string, starting at the end of the string and\n\
12705working to the front. If maxsplit is given, at most maxsplit\n\
12706splits are done. If sep is not specified, any whitespace string\n\
12707is a separator.");
12708
12709static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012710unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012711{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012712 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012713 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012714 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012715
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012716 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12717 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012718 return NULL;
12719
12720 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012722 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012723 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012724 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012725 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012726}
12727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012728PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730\n\
12731Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012732Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012733is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734
12735static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012736unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012738 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012739 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012741 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12742 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743 return NULL;
12744
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012745 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746}
12747
12748static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012749PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012751 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752}
12753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012754PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756\n\
12757Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012758and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759
12760static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012761unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012763 if (PyUnicode_READY(self) == -1)
12764 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012765 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766}
12767
Larry Hastings61272b72014-01-07 12:41:53 -080012768/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012769
Larry Hastings31826802013-10-19 00:09:25 -070012770@staticmethod
12771str.maketrans as unicode_maketrans
12772
12773 x: object
12774
12775 y: unicode=NULL
12776
12777 z: unicode=NULL
12778
12779 /
12780
12781Return a translation table usable for str.translate().
12782
12783If there is only one argument, it must be a dictionary mapping Unicode
12784ordinals (integers) or characters to Unicode ordinals, strings or None.
12785Character keys will be then converted to ordinals.
12786If there are two arguments, they must be strings of equal length, and
12787in the resulting dictionary, each character in x will be mapped to the
12788character at the same position in y. If there is a third argument, it
12789must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012790[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012791
12792PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -080012793"maketrans(x, y=None, z=None, /)\n"
12794"--\n"
12795"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012796"Return a translation table usable for str.translate().\n"
12797"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012798"If there is only one argument, it must be a dictionary mapping Unicode\n"
12799"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12800"Character keys will be then converted to ordinals.\n"
12801"If there are two arguments, they must be strings of equal length, and\n"
12802"in the resulting dictionary, each character in x will be mapped to the\n"
12803"character at the same position in y. If there is a third argument, it\n"
12804"must be a string, whose characters will be mapped to None in the result.");
12805
12806#define UNICODE_MAKETRANS_METHODDEF \
12807 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12808
12809static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012810unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012811
12812static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012813unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012814{
Larry Hastings31826802013-10-19 00:09:25 -070012815 PyObject *return_value = NULL;
12816 PyObject *x;
12817 PyObject *y = NULL;
12818 PyObject *z = NULL;
12819
12820 if (!PyArg_ParseTuple(args,
12821 "O|UU:maketrans",
12822 &x, &y, &z))
12823 goto exit;
Larry Hastings5c661892014-01-24 06:17:25 -080012824 return_value = unicode_maketrans_impl(x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012825
12826exit:
12827 return return_value;
12828}
12829
12830static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012831unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012832/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012833{
Georg Brandlceee0772007-11-27 23:48:05 +000012834 PyObject *new = NULL, *key, *value;
12835 Py_ssize_t i = 0;
12836 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012837
Georg Brandlceee0772007-11-27 23:48:05 +000012838 new = PyDict_New();
12839 if (!new)
12840 return NULL;
12841 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 int x_kind, y_kind, z_kind;
12843 void *x_data, *y_data, *z_data;
12844
Georg Brandlceee0772007-11-27 23:48:05 +000012845 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012846 if (!PyUnicode_Check(x)) {
12847 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12848 "be a string if there is a second argument");
12849 goto err;
12850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012852 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12853 "arguments must have equal length");
12854 goto err;
12855 }
12856 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857 x_kind = PyUnicode_KIND(x);
12858 y_kind = PyUnicode_KIND(y);
12859 x_data = PyUnicode_DATA(x);
12860 y_data = PyUnicode_DATA(y);
12861 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12862 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012863 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012864 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012865 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012866 if (!value) {
12867 Py_DECREF(key);
12868 goto err;
12869 }
Georg Brandlceee0772007-11-27 23:48:05 +000012870 res = PyDict_SetItem(new, key, value);
12871 Py_DECREF(key);
12872 Py_DECREF(value);
12873 if (res < 0)
12874 goto err;
12875 }
12876 /* create entries for deleting chars in z */
12877 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 z_kind = PyUnicode_KIND(z);
12879 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012880 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012882 if (!key)
12883 goto err;
12884 res = PyDict_SetItem(new, key, Py_None);
12885 Py_DECREF(key);
12886 if (res < 0)
12887 goto err;
12888 }
12889 }
12890 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 int kind;
12892 void *data;
12893
Georg Brandlceee0772007-11-27 23:48:05 +000012894 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012895 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012896 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12897 "to maketrans it must be a dict");
12898 goto err;
12899 }
12900 /* copy entries into the new dict, converting string keys to int keys */
12901 while (PyDict_Next(x, &i, &key, &value)) {
12902 if (PyUnicode_Check(key)) {
12903 /* convert string keys to integer keys */
12904 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012905 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012906 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12907 "table must be of length 1");
12908 goto err;
12909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 kind = PyUnicode_KIND(key);
12911 data = PyUnicode_DATA(key);
12912 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012913 if (!newkey)
12914 goto err;
12915 res = PyDict_SetItem(new, newkey, value);
12916 Py_DECREF(newkey);
12917 if (res < 0)
12918 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012919 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012920 /* just keep integer keys */
12921 if (PyDict_SetItem(new, key, value) < 0)
12922 goto err;
12923 } else {
12924 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12925 "be strings or integers");
12926 goto err;
12927 }
12928 }
12929 }
12930 return new;
12931 err:
12932 Py_DECREF(new);
12933 return NULL;
12934}
12935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012936PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012937 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938\n\
12939Return a copy of the string S, where all characters have been mapped\n\
12940through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012941Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012942Unmapped characters are left untouched. Characters mapped to None\n\
12943are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944
12945static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949}
12950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012951PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012952 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012954Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955
12956static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012957unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012959 if (PyUnicode_READY(self) == -1)
12960 return NULL;
12961 if (PyUnicode_IS_ASCII(self))
12962 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012963 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964}
12965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012966PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012967 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012969Pad a numeric string S with zeros on the left, to fill a field\n\
12970of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971
12972static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012973unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012975 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012976 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012977 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 int kind;
12979 void *data;
12980 Py_UCS4 chr;
12981
Martin v. Löwis18e16552006-02-15 17:27:45 +000012982 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983 return NULL;
12984
Benjamin Petersonbac79492012-01-14 13:34:47 -050012985 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987
Victor Stinnerc4b49542011-12-11 22:44:26 +010012988 if (PyUnicode_GET_LENGTH(self) >= width)
12989 return unicode_result_unchanged(self);
12990
12991 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992
12993 u = pad(self, fill, 0, '0');
12994
Walter Dörwald068325e2002-04-15 13:36:47 +000012995 if (u == NULL)
12996 return NULL;
12997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 kind = PyUnicode_KIND(u);
12999 data = PyUnicode_DATA(u);
13000 chr = PyUnicode_READ(kind, data, fill);
13001
13002 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013003 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 PyUnicode_WRITE(kind, data, 0, chr);
13005 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006 }
13007
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013008 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013009 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013010}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011
#if 0
/* Compiled out.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013020PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013021 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013023Return True if S starts with the specified prefix, False otherwise.\n\
13024With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013025With optional end, stop comparing S at that position.\n\
13026prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027
13028static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013029unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013030 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013032 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013033 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013034 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013035 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013036 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013037
Jesus Ceaac451502011-04-20 17:09:23 +020013038 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013039 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013040 if (PyTuple_Check(subobj)) {
13041 Py_ssize_t i;
13042 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013043 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013044 if (substring == NULL)
13045 return NULL;
13046 result = tailmatch(self, substring, start, end, -1);
13047 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013048 if (result == -1)
13049 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013050 if (result) {
13051 Py_RETURN_TRUE;
13052 }
13053 }
13054 /* nothing matched */
13055 Py_RETURN_FALSE;
13056 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013057 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013058 if (substring == NULL) {
13059 if (PyErr_ExceptionMatches(PyExc_TypeError))
13060 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13061 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013063 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013064 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013066 if (result == -1)
13067 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013068 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069}
13070
13071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013072PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013073 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013075Return True if S ends with the specified suffix, False otherwise.\n\
13076With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013077With optional end, stop comparing S at that position.\n\
13078suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079
13080static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013081unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013082 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013084 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013085 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013086 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013087 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013088 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089
Jesus Ceaac451502011-04-20 17:09:23 +020013090 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013092 if (PyTuple_Check(subobj)) {
13093 Py_ssize_t i;
13094 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013095 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013096 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013097 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013098 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013099 result = tailmatch(self, substring, start, end, +1);
13100 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013101 if (result == -1)
13102 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013103 if (result) {
13104 Py_RETURN_TRUE;
13105 }
13106 }
13107 Py_RETURN_FALSE;
13108 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013109 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013110 if (substring == NULL) {
13111 if (PyErr_ExceptionMatches(PyExc_TypeError))
13112 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13113 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013115 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013116 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013117 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013118 if (result == -1)
13119 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013120 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121}
13122
Victor Stinner202fdca2012-05-07 12:47:02 +020013123Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013124_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013125{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013126 if (!writer->readonly)
13127 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13128 else {
13129 /* Copy-on-write mode: set buffer size to 0 so
13130 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13131 * next write. */
13132 writer->size = 0;
13133 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013134 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13135 writer->data = PyUnicode_DATA(writer->buffer);
13136 writer->kind = PyUnicode_KIND(writer->buffer);
13137}
13138
Victor Stinnerd3f08822012-05-29 12:57:52 +020013139void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013140_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013141{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013142 memset(writer, 0, sizeof(*writer));
13143#ifdef Py_DEBUG
13144 writer->kind = 5; /* invalid kind */
13145#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013146 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013147}
13148
Victor Stinnerd3f08822012-05-29 12:57:52 +020013149int
13150_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13151 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013152{
Victor Stinner6989ba02013-11-18 21:08:39 +010013153#ifdef MS_WINDOWS
13154 /* On Windows, overallocate by 50% is the best factor */
13155# define OVERALLOCATE_FACTOR 2
13156#else
13157 /* On Linux, overallocate by 25% is the best factor */
13158# define OVERALLOCATE_FACTOR 4
13159#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013160 Py_ssize_t newlen;
13161 PyObject *newbuffer;
13162
Victor Stinnerd3f08822012-05-29 12:57:52 +020013163 assert(length > 0);
13164
Victor Stinner202fdca2012-05-07 12:47:02 +020013165 if (length > PY_SSIZE_T_MAX - writer->pos) {
13166 PyErr_NoMemory();
13167 return -1;
13168 }
13169 newlen = writer->pos + length;
13170
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013171 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013172
Victor Stinnerd3f08822012-05-29 12:57:52 +020013173 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013174 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013175 if (writer->overallocate
13176 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13177 /* overallocate to limit the number of realloc() */
13178 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013179 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013180 if (newlen < writer->min_length)
13181 newlen = writer->min_length;
13182
Victor Stinnerd3f08822012-05-29 12:57:52 +020013183 writer->buffer = PyUnicode_New(newlen, maxchar);
13184 if (writer->buffer == NULL)
13185 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013186 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013187 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013188 if (writer->overallocate
13189 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13190 /* overallocate to limit the number of realloc() */
13191 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013192 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013193 if (newlen < writer->min_length)
13194 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013195
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013196 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013197 /* resize + widen */
13198 newbuffer = PyUnicode_New(newlen, maxchar);
13199 if (newbuffer == NULL)
13200 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013201 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13202 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013203 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013204 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013205 }
13206 else {
13207 newbuffer = resize_compact(writer->buffer, newlen);
13208 if (newbuffer == NULL)
13209 return -1;
13210 }
13211 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013212 }
13213 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013214 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013215 newbuffer = PyUnicode_New(writer->size, maxchar);
13216 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013217 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013218 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13219 writer->buffer, 0, writer->pos);
13220 Py_DECREF(writer->buffer);
13221 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013222 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013223 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013224 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013225
13226#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013227}
13228
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013229Py_LOCAL_INLINE(int)
13230_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013231{
13232 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13233 return -1;
13234 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13235 writer->pos++;
13236 return 0;
13237}
13238
13239int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013240_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13241{
13242 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13243}
13244
13245int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013246_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13247{
13248 Py_UCS4 maxchar;
13249 Py_ssize_t len;
13250
13251 if (PyUnicode_READY(str) == -1)
13252 return -1;
13253 len = PyUnicode_GET_LENGTH(str);
13254 if (len == 0)
13255 return 0;
13256 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13257 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013258 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013259 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013260 Py_INCREF(str);
13261 writer->buffer = str;
13262 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013263 writer->pos += len;
13264 return 0;
13265 }
13266 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13267 return -1;
13268 }
13269 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13270 str, 0, len);
13271 writer->pos += len;
13272 return 0;
13273}
13274
Victor Stinnere215d962012-10-06 23:03:36 +020013275int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013276_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13277 Py_ssize_t start, Py_ssize_t end)
13278{
13279 Py_UCS4 maxchar;
13280 Py_ssize_t len;
13281
13282 if (PyUnicode_READY(str) == -1)
13283 return -1;
13284
13285 assert(0 <= start);
13286 assert(end <= PyUnicode_GET_LENGTH(str));
13287 assert(start <= end);
13288
13289 if (end == 0)
13290 return 0;
13291
13292 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13293 return _PyUnicodeWriter_WriteStr(writer, str);
13294
13295 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13296 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13297 else
13298 maxchar = writer->maxchar;
13299 len = end - start;
13300
13301 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13302 return -1;
13303
13304 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13305 str, start, len);
13306 writer->pos += len;
13307 return 0;
13308}
13309
13310int
Victor Stinner4a587072013-11-19 12:54:53 +010013311_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13312 const char *ascii, Py_ssize_t len)
13313{
13314 if (len == -1)
13315 len = strlen(ascii);
13316
13317 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13318
13319 if (writer->buffer == NULL && !writer->overallocate) {
13320 PyObject *str;
13321
13322 str = _PyUnicode_FromASCII(ascii, len);
13323 if (str == NULL)
13324 return -1;
13325
13326 writer->readonly = 1;
13327 writer->buffer = str;
13328 _PyUnicodeWriter_Update(writer);
13329 writer->pos += len;
13330 return 0;
13331 }
13332
13333 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13334 return -1;
13335
13336 switch (writer->kind)
13337 {
13338 case PyUnicode_1BYTE_KIND:
13339 {
13340 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13341 Py_UCS1 *data = writer->data;
13342
13343 Py_MEMCPY(data + writer->pos, str, len);
13344 break;
13345 }
13346 case PyUnicode_2BYTE_KIND:
13347 {
13348 _PyUnicode_CONVERT_BYTES(
13349 Py_UCS1, Py_UCS2,
13350 ascii, ascii + len,
13351 (Py_UCS2 *)writer->data + writer->pos);
13352 break;
13353 }
13354 case PyUnicode_4BYTE_KIND:
13355 {
13356 _PyUnicode_CONVERT_BYTES(
13357 Py_UCS1, Py_UCS4,
13358 ascii, ascii + len,
13359 (Py_UCS4 *)writer->data + writer->pos);
13360 break;
13361 }
13362 default:
13363 assert(0);
13364 }
13365
13366 writer->pos += len;
13367 return 0;
13368}
13369
13370int
13371_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13372 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013373{
13374 Py_UCS4 maxchar;
13375
13376 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13377 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13378 return -1;
13379 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13380 writer->pos += len;
13381 return 0;
13382}
13383
Victor Stinnerd3f08822012-05-29 12:57:52 +020013384PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013385_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013386{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013387 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013388 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013389 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013390 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013391 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013392 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013393 str = writer->buffer;
13394 writer->buffer = NULL;
13395 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13396 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013397 }
13398 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13399 PyObject *newbuffer;
13400 newbuffer = resize_compact(writer->buffer, writer->pos);
13401 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013402 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013403 return NULL;
13404 }
13405 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013406 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013407 str = writer->buffer;
13408 writer->buffer = NULL;
13409 assert(_PyUnicode_CheckConsistency(str, 1));
13410 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013411}
13412
Victor Stinnerd3f08822012-05-29 12:57:52 +020013413void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013414_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013415{
13416 Py_CLEAR(writer->buffer);
13417}
13418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013420
13421PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013423\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013424Return a formatted version of S, using substitutions from args and kwargs.\n\
13425The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013426
Eric Smith27bbca62010-11-04 17:06:58 +000013427PyDoc_STRVAR(format_map__doc__,
13428 "S.format_map(mapping) -> str\n\
13429\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013430Return a formatted version of S, using substitutions from mapping.\n\
13431The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013432
Eric Smith4a7d76d2008-05-30 18:10:19 +000013433static PyObject *
13434unicode__format__(PyObject* self, PyObject* args)
13435{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013436 PyObject *format_spec;
13437 _PyUnicodeWriter writer;
13438 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013439
13440 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13441 return NULL;
13442
Victor Stinnerd3f08822012-05-29 12:57:52 +020013443 if (PyUnicode_READY(self) == -1)
13444 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013445 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013446 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13447 self, format_spec, 0,
13448 PyUnicode_GET_LENGTH(format_spec));
13449 if (ret == -1) {
13450 _PyUnicodeWriter_Dealloc(&writer);
13451 return NULL;
13452 }
13453 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013454}
13455
Eric Smith8c663262007-08-25 02:26:07 +000013456PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013458\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013459Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013460
13461static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013462unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013464 Py_ssize_t size;
13465
13466 /* If it's a compact object, account for base structure +
13467 character data. */
13468 if (PyUnicode_IS_COMPACT_ASCII(v))
13469 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13470 else if (PyUnicode_IS_COMPACT(v))
13471 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013472 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013473 else {
13474 /* If it is a two-block object, account for base object, and
13475 for character block if present. */
13476 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013477 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013478 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013479 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013480 }
13481 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013482 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013483 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013484 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013485 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013486 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013487
13488 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013489}
13490
13491PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013493
13494static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013495unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013496{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013497 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013498 if (!copy)
13499 return NULL;
13500 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013501}
13502
Guido van Rossumd57fd912000-03-10 22:53:23 +000013503static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013504 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013505 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013506 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13507 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013508 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13509 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013510 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013511 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13512 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13513 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013514 {"expandtabs", (PyCFunction) unicode_expandtabs,
13515 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013516 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013517 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013518 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13519 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13520 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013521 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013522 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13523 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13524 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013525 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013526 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013527 {"splitlines", (PyCFunction) unicode_splitlines,
13528 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013529 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013530 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13531 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13532 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13533 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13534 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13535 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13536 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13537 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13538 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13539 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13540 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13541 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13542 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13543 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013544 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013545 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013546 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013547 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013548 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013549 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013550 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013551 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013552#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013553 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013554 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013555#endif
13556
Benjamin Peterson14339b62009-01-31 16:36:08 +000013557 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013558 {NULL, NULL}
13559};
13560
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013561static PyObject *
13562unicode_mod(PyObject *v, PyObject *w)
13563{
Brian Curtindfc80e32011-08-10 20:28:54 -050013564 if (!PyUnicode_Check(v))
13565 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013567}
13568
13569static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013570 0, /*nb_add*/
13571 0, /*nb_subtract*/
13572 0, /*nb_multiply*/
13573 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013574};
13575
Guido van Rossumd57fd912000-03-10 22:53:23 +000013576static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013577 (lenfunc) unicode_length, /* sq_length */
13578 PyUnicode_Concat, /* sq_concat */
13579 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13580 (ssizeargfunc) unicode_getitem, /* sq_item */
13581 0, /* sq_slice */
13582 0, /* sq_ass_item */
13583 0, /* sq_ass_slice */
13584 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013585};
13586
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013587static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013588unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013590 if (PyUnicode_READY(self) == -1)
13591 return NULL;
13592
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013593 if (PyIndex_Check(item)) {
13594 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013595 if (i == -1 && PyErr_Occurred())
13596 return NULL;
13597 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013598 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013599 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013600 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013601 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013602 PyObject *result;
13603 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013604 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013605 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013609 return NULL;
13610 }
13611
13612 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013613 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013615 slicelength == PyUnicode_GET_LENGTH(self)) {
13616 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013617 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013618 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013619 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013620 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013621 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013622 src_kind = PyUnicode_KIND(self);
13623 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013624 if (!PyUnicode_IS_ASCII(self)) {
13625 kind_limit = kind_maxchar_limit(src_kind);
13626 max_char = 0;
13627 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13628 ch = PyUnicode_READ(src_kind, src_data, cur);
13629 if (ch > max_char) {
13630 max_char = ch;
13631 if (max_char >= kind_limit)
13632 break;
13633 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013634 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013635 }
Victor Stinner55c99112011-10-13 01:17:06 +020013636 else
13637 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013638 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013639 if (result == NULL)
13640 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013641 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013642 dest_data = PyUnicode_DATA(result);
13643
13644 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013645 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13646 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013647 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013648 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013649 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013650 } else {
13651 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13652 return NULL;
13653 }
13654}
13655
13656static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013657 (lenfunc)unicode_length, /* mp_length */
13658 (binaryfunc)unicode_subscript, /* mp_subscript */
13659 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013660};
13661
Guido van Rossumd57fd912000-03-10 22:53:23 +000013662
Guido van Rossumd57fd912000-03-10 22:53:23 +000013663/* Helpers for PyUnicode_Format() */
13664
Victor Stinnera47082312012-10-04 02:19:54 +020013665struct unicode_formatter_t {
13666 PyObject *args;
13667 int args_owned;
13668 Py_ssize_t arglen, argidx;
13669 PyObject *dict;
13670
13671 enum PyUnicode_Kind fmtkind;
13672 Py_ssize_t fmtcnt, fmtpos;
13673 void *fmtdata;
13674 PyObject *fmtstr;
13675
13676 _PyUnicodeWriter writer;
13677};
13678
13679struct unicode_format_arg_t {
13680 Py_UCS4 ch;
13681 int flags;
13682 Py_ssize_t width;
13683 int prec;
13684 int sign;
13685};
13686
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013688unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689{
Victor Stinnera47082312012-10-04 02:19:54 +020013690 Py_ssize_t argidx = ctx->argidx;
13691
13692 if (argidx < ctx->arglen) {
13693 ctx->argidx++;
13694 if (ctx->arglen < 0)
13695 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013696 else
Victor Stinnera47082312012-10-04 02:19:54 +020013697 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013698 }
13699 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013700 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013701 return NULL;
13702}
13703
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013704/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013705
Victor Stinnera47082312012-10-04 02:19:54 +020013706/* Format a float into the writer if the writer is not NULL, or into *p_output
13707 otherwise.
13708
13709 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013710static int
Victor Stinnera47082312012-10-04 02:19:54 +020013711formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13712 PyObject **p_output,
13713 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013714{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013715 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013716 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013717 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013718 int prec;
13719 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013720
Guido van Rossumd57fd912000-03-10 22:53:23 +000013721 x = PyFloat_AsDouble(v);
13722 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013723 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013724
Victor Stinnera47082312012-10-04 02:19:54 +020013725 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013728
Victor Stinnera47082312012-10-04 02:19:54 +020013729 if (arg->flags & F_ALT)
13730 dtoa_flags = Py_DTSF_ALT;
13731 else
13732 dtoa_flags = 0;
13733 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013734 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013735 return -1;
13736 len = strlen(p);
13737 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013738 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013739 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013740 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013741 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013742 }
13743 else
13744 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013745 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013746 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747}
13748
/* formatlong() emulates the format codes d, i, u, o, x and X, and
 * the F_ALT flag, for Python ints.
 * Return value: a new PyUnicodeObject*, or NULL if error.
 *  The output string is of the form
 *      "-"? ("0x" | "0X" | "0o")? digit+
 *  The base prefix is present only for o, x and X conversions with F_ALT
 *  set in arg->flags.  The case of hex digits will be correct.
 *  There will be at least arg->prec digits, zero-filled on the left if
 *  necessary to get that many.
 * val          object to be converted; must satisfy PyLong_Check()
 * arg->flags   bitmask of format flags; only F_ALT is looked at
 * arg->prec    minimum number of digits; 0-fill on left if needed
 * arg->ch      a character in [diuoxX]; i and u act the same as d
 *
 * The sign and width flags are not handled here; this function only reads
 * arg->prec, arg->ch and F_ALT.
 */
static PyObject*
formatlong(PyObject *val, struct unicode_format_arg_t *arg)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;
    int prec = arg->prec;
    int type = arg->ch;

    /* Keep numnondigits + prec (at most 3 + prec) representable as an int */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Produce the digits, always with the base prefix for o/x/X;
       numnondigits counts the prefix characters (the sign is added below). */
    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        /* NOTE(review): the message names _PyBytes_FormatLong although this
           function is formatlong() -- looks like a leftover; confirm before
           changing the text. */
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyBytes_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((arg->flags & F_ALT) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; for a negative number the '-' is rewritten
           over the second prefix character so it directly precedes the
           digits. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object:
           non-digits, then the zero fill, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        /* From here on result is the bytes object and buf points into it. */
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If result is the temporary bytes object, or buf no longer points at
       the start of the string data (prefix was skipped), create a fresh
       string from buf; otherwise only shrink result if its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
13896
Ethan Furmandf3ed242014-01-05 06:50:30 -080013897/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020013898 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013899 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013900 * -1 and raise an exception on error */
13901static int
Victor Stinnera47082312012-10-04 02:19:54 +020013902mainformatlong(PyObject *v,
13903 struct unicode_format_arg_t *arg,
13904 PyObject **p_output,
13905 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013906{
13907 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013908 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013909
13910 if (!PyNumber_Check(v))
13911 goto wrongtype;
13912
Ethan Furman9ab74802014-03-21 06:38:46 -070013913 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020013914 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080013915 if (type == 'o' || type == 'x' || type == 'X') {
13916 iobj = PyNumber_Index(v);
13917 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070013918 if (PyErr_ExceptionMatches(PyExc_TypeError))
13919 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070013920 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080013921 }
13922 }
13923 else {
13924 iobj = PyNumber_Long(v);
13925 if (iobj == NULL ) {
13926 if (PyErr_ExceptionMatches(PyExc_TypeError))
13927 goto wrongtype;
13928 return -1;
13929 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013930 }
13931 assert(PyLong_Check(iobj));
13932 }
13933 else {
13934 iobj = v;
13935 Py_INCREF(iobj);
13936 }
13937
13938 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013939 && arg->width == -1 && arg->prec == -1
13940 && !(arg->flags & (F_SIGN | F_BLANK))
13941 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013942 {
13943 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013944 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013945 int base;
13946
Victor Stinnera47082312012-10-04 02:19:54 +020013947 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013948 {
13949 default:
13950 assert(0 && "'type' not in [diuoxX]");
13951 case 'd':
13952 case 'i':
13953 case 'u':
13954 base = 10;
13955 break;
13956 case 'o':
13957 base = 8;
13958 break;
13959 case 'x':
13960 case 'X':
13961 base = 16;
13962 break;
13963 }
13964
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013965 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13966 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013967 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013968 }
13969 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013970 return 1;
13971 }
13972
Victor Stinnera47082312012-10-04 02:19:54 +020013973 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013974 Py_DECREF(iobj);
13975 if (res == NULL)
13976 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013977 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013978 return 0;
13979
13980wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070013981 switch(type)
13982 {
13983 case 'o':
13984 case 'x':
13985 case 'X':
13986 PyErr_Format(PyExc_TypeError,
13987 "%%%c format: an integer is required, "
13988 "not %.200s",
13989 type, Py_TYPE(v)->tp_name);
13990 break;
13991 default:
13992 PyErr_Format(PyExc_TypeError,
13993 "%%%c format: a number is required, "
13994 "not %.200s",
13995 type, Py_TYPE(v)->tp_name);
13996 break;
13997 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013998 return -1;
13999}
14000
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014001static Py_UCS4
14002formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014003{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014004 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014005 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014006 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014007 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014008 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014009 goto onError;
14010 }
14011 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014012 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014013 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014014 /* make sure number is a type of integer */
14015 if (!PyLong_Check(v)) {
14016 iobj = PyNumber_Index(v);
14017 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014018 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014019 }
14020 v = iobj;
14021 Py_DECREF(iobj);
14022 }
14023 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014024 x = PyLong_AsLong(v);
14025 if (x == -1 && PyErr_Occurred())
14026 goto onError;
14027
Victor Stinner8faf8212011-12-08 22:14:11 +010014028 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014029 PyErr_SetString(PyExc_OverflowError,
14030 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014031 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014032 }
14033
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014034 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014035 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014036
Benjamin Peterson29060642009-01-31 22:14:21 +000014037 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014038 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014039 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014040 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014041}
14042
Victor Stinnera47082312012-10-04 02:19:54 +020014043/* Parse options of an argument: flags, width, precision.
14044 Handle also "%(name)" syntax.
14045
14046 Return 0 if the argument has been formatted into arg->str.
14047 Return 1 if the argument has been written into ctx->writer,
14048 Raise an exception and return -1 on error. */
14049static int
14050unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14051 struct unicode_format_arg_t *arg)
14052{
14053#define FORMAT_READ(ctx) \
14054 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14055
14056 PyObject *v;
14057
Victor Stinnera47082312012-10-04 02:19:54 +020014058 if (arg->ch == '(') {
14059 /* Get argument value from a dictionary. Example: "%(name)s". */
14060 Py_ssize_t keystart;
14061 Py_ssize_t keylen;
14062 PyObject *key;
14063 int pcount = 1;
14064
14065 if (ctx->dict == NULL) {
14066 PyErr_SetString(PyExc_TypeError,
14067 "format requires a mapping");
14068 return -1;
14069 }
14070 ++ctx->fmtpos;
14071 --ctx->fmtcnt;
14072 keystart = ctx->fmtpos;
14073 /* Skip over balanced parentheses */
14074 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14075 arg->ch = FORMAT_READ(ctx);
14076 if (arg->ch == ')')
14077 --pcount;
14078 else if (arg->ch == '(')
14079 ++pcount;
14080 ctx->fmtpos++;
14081 }
14082 keylen = ctx->fmtpos - keystart - 1;
14083 if (ctx->fmtcnt < 0 || pcount > 0) {
14084 PyErr_SetString(PyExc_ValueError,
14085 "incomplete format key");
14086 return -1;
14087 }
14088 key = PyUnicode_Substring(ctx->fmtstr,
14089 keystart, keystart + keylen);
14090 if (key == NULL)
14091 return -1;
14092 if (ctx->args_owned) {
14093 Py_DECREF(ctx->args);
14094 ctx->args_owned = 0;
14095 }
14096 ctx->args = PyObject_GetItem(ctx->dict, key);
14097 Py_DECREF(key);
14098 if (ctx->args == NULL)
14099 return -1;
14100 ctx->args_owned = 1;
14101 ctx->arglen = -1;
14102 ctx->argidx = -2;
14103 }
14104
14105 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014106 while (--ctx->fmtcnt >= 0) {
14107 arg->ch = FORMAT_READ(ctx);
14108 ctx->fmtpos++;
14109 switch (arg->ch) {
14110 case '-': arg->flags |= F_LJUST; continue;
14111 case '+': arg->flags |= F_SIGN; continue;
14112 case ' ': arg->flags |= F_BLANK; continue;
14113 case '#': arg->flags |= F_ALT; continue;
14114 case '0': arg->flags |= F_ZERO; continue;
14115 }
14116 break;
14117 }
14118
14119 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014120 if (arg->ch == '*') {
14121 v = unicode_format_getnextarg(ctx);
14122 if (v == NULL)
14123 return -1;
14124 if (!PyLong_Check(v)) {
14125 PyErr_SetString(PyExc_TypeError,
14126 "* wants int");
14127 return -1;
14128 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014129 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014130 if (arg->width == -1 && PyErr_Occurred())
14131 return -1;
14132 if (arg->width < 0) {
14133 arg->flags |= F_LJUST;
14134 arg->width = -arg->width;
14135 }
14136 if (--ctx->fmtcnt >= 0) {
14137 arg->ch = FORMAT_READ(ctx);
14138 ctx->fmtpos++;
14139 }
14140 }
14141 else if (arg->ch >= '0' && arg->ch <= '9') {
14142 arg->width = arg->ch - '0';
14143 while (--ctx->fmtcnt >= 0) {
14144 arg->ch = FORMAT_READ(ctx);
14145 ctx->fmtpos++;
14146 if (arg->ch < '0' || arg->ch > '9')
14147 break;
14148 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14149 mixing signed and unsigned comparison. Since arg->ch is between
14150 '0' and '9', casting to int is safe. */
14151 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14152 PyErr_SetString(PyExc_ValueError,
14153 "width too big");
14154 return -1;
14155 }
14156 arg->width = arg->width*10 + (arg->ch - '0');
14157 }
14158 }
14159
14160 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014161 if (arg->ch == '.') {
14162 arg->prec = 0;
14163 if (--ctx->fmtcnt >= 0) {
14164 arg->ch = FORMAT_READ(ctx);
14165 ctx->fmtpos++;
14166 }
14167 if (arg->ch == '*') {
14168 v = unicode_format_getnextarg(ctx);
14169 if (v == NULL)
14170 return -1;
14171 if (!PyLong_Check(v)) {
14172 PyErr_SetString(PyExc_TypeError,
14173 "* wants int");
14174 return -1;
14175 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014176 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014177 if (arg->prec == -1 && PyErr_Occurred())
14178 return -1;
14179 if (arg->prec < 0)
14180 arg->prec = 0;
14181 if (--ctx->fmtcnt >= 0) {
14182 arg->ch = FORMAT_READ(ctx);
14183 ctx->fmtpos++;
14184 }
14185 }
14186 else if (arg->ch >= '0' && arg->ch <= '9') {
14187 arg->prec = arg->ch - '0';
14188 while (--ctx->fmtcnt >= 0) {
14189 arg->ch = FORMAT_READ(ctx);
14190 ctx->fmtpos++;
14191 if (arg->ch < '0' || arg->ch > '9')
14192 break;
14193 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14194 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014195 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014196 return -1;
14197 }
14198 arg->prec = arg->prec*10 + (arg->ch - '0');
14199 }
14200 }
14201 }
14202
14203 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14204 if (ctx->fmtcnt >= 0) {
14205 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14206 if (--ctx->fmtcnt >= 0) {
14207 arg->ch = FORMAT_READ(ctx);
14208 ctx->fmtpos++;
14209 }
14210 }
14211 }
14212 if (ctx->fmtcnt < 0) {
14213 PyErr_SetString(PyExc_ValueError,
14214 "incomplete format");
14215 return -1;
14216 }
14217 return 0;
14218
14219#undef FORMAT_READ
14220}
14221
14222/* Format one argument. Supported conversion specifiers:
14223
14224 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014225 - "i", "d", "u": int or float
14226 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014227 - "e", "E", "f", "F", "g", "G": float
14228 - "c": int or str (1 character)
14229
Victor Stinner8dbd4212012-12-04 09:30:24 +010014230 When possible, the output is written directly into the Unicode writer
14231 (ctx->writer). A string is created when padding is required.
14232
Victor Stinnera47082312012-10-04 02:19:54 +020014233 Return 0 if the argument has been formatted into *p_str,
14234 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014235 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014236static int
14237unicode_format_arg_format(struct unicode_formatter_t *ctx,
14238 struct unicode_format_arg_t *arg,
14239 PyObject **p_str)
14240{
14241 PyObject *v;
14242 _PyUnicodeWriter *writer = &ctx->writer;
14243
14244 if (ctx->fmtcnt == 0)
14245 ctx->writer.overallocate = 0;
14246
14247 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014248 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014249 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014250 return 1;
14251 }
14252
14253 v = unicode_format_getnextarg(ctx);
14254 if (v == NULL)
14255 return -1;
14256
Victor Stinnera47082312012-10-04 02:19:54 +020014257
14258 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014259 case 's':
14260 case 'r':
14261 case 'a':
14262 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14263 /* Fast path */
14264 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14265 return -1;
14266 return 1;
14267 }
14268
14269 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14270 *p_str = v;
14271 Py_INCREF(*p_str);
14272 }
14273 else {
14274 if (arg->ch == 's')
14275 *p_str = PyObject_Str(v);
14276 else if (arg->ch == 'r')
14277 *p_str = PyObject_Repr(v);
14278 else
14279 *p_str = PyObject_ASCII(v);
14280 }
14281 break;
14282
14283 case 'i':
14284 case 'd':
14285 case 'u':
14286 case 'o':
14287 case 'x':
14288 case 'X':
14289 {
14290 int ret = mainformatlong(v, arg, p_str, writer);
14291 if (ret != 0)
14292 return ret;
14293 arg->sign = 1;
14294 break;
14295 }
14296
14297 case 'e':
14298 case 'E':
14299 case 'f':
14300 case 'F':
14301 case 'g':
14302 case 'G':
14303 if (arg->width == -1 && arg->prec == -1
14304 && !(arg->flags & (F_SIGN | F_BLANK)))
14305 {
14306 /* Fast path */
14307 if (formatfloat(v, arg, NULL, writer) == -1)
14308 return -1;
14309 return 1;
14310 }
14311
14312 arg->sign = 1;
14313 if (formatfloat(v, arg, p_str, NULL) == -1)
14314 return -1;
14315 break;
14316
14317 case 'c':
14318 {
14319 Py_UCS4 ch = formatchar(v);
14320 if (ch == (Py_UCS4) -1)
14321 return -1;
14322 if (arg->width == -1 && arg->prec == -1) {
14323 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014324 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014325 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014326 return 1;
14327 }
14328 *p_str = PyUnicode_FromOrdinal(ch);
14329 break;
14330 }
14331
14332 default:
14333 PyErr_Format(PyExc_ValueError,
14334 "unsupported format character '%c' (0x%x) "
14335 "at index %zd",
14336 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14337 (int)arg->ch,
14338 ctx->fmtpos - 1);
14339 return -1;
14340 }
14341 if (*p_str == NULL)
14342 return -1;
14343 assert (PyUnicode_Check(*p_str));
14344 return 0;
14345}
14346
/* Write the already formatted string str into ctx->writer, applying the
   precision (truncation for "s", "r", "a"), sign, field width, fill
   character and alternate-form prefix described by arg.

   arg->width and arg->sign are used as scratch state and are modified.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero fill only applies to numeric output (arg->sign set) */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        /* No padding, no truncation, no sign to add: copy str as is */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* Take the sign off the text; it is written separately below */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    /* maxchar must cover the fill character (when left padding will be
       written) and the characters copied from str */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* Total number of characters to write: the field width, plus one for
       the sign when the text alone already fills the field */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With zero fill the sign goes before the padding; with space
           fill it is written after the padding (see below) */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign takes one column of the field */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The prefix is accounted for separately from the digits, whether
           it was written above or will be written after the padding */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding is done: prevents the right padding below */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* (right padding always uses spaces, never '0') */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14501
14502/* Helper of PyUnicode_Format(): format one arg.
14503 Return 0 on success, raise an exception and return -1 on error. */
14504static int
14505unicode_format_arg(struct unicode_formatter_t *ctx)
14506{
14507 struct unicode_format_arg_t arg;
14508 PyObject *str;
14509 int ret;
14510
Victor Stinner8dbd4212012-12-04 09:30:24 +010014511 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14512 arg.flags = 0;
14513 arg.width = -1;
14514 arg.prec = -1;
14515 arg.sign = 0;
14516 str = NULL;
14517
Victor Stinnera47082312012-10-04 02:19:54 +020014518 ret = unicode_format_arg_parse(ctx, &arg);
14519 if (ret == -1)
14520 return -1;
14521
14522 ret = unicode_format_arg_format(ctx, &arg, &str);
14523 if (ret == -1)
14524 return -1;
14525
14526 if (ret != 1) {
14527 ret = unicode_format_arg_output(ctx, &arg, str);
14528 Py_DECREF(str);
14529 if (ret == -1)
14530 return -1;
14531 }
14532
14533 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14534 PyErr_SetString(PyExc_TypeError,
14535 "not all arguments converted during string formatting");
14536 return -1;
14537 }
14538 return 0;
14539}
14540
/* Implementation of the "format % args" operation for str.

   format is converted with PyUnicode_FromObject().  args is either a tuple
   of arguments, a mapping (used for "%(name)" specifiers) or a single
   argument.

   Return the formatted string (the result of _PyUnicodeWriter_Finish()),
   or NULL with an exception set on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    ctx.fmtstr = PyUnicode_FromObject(format);
    if (ctx.fmtstr == NULL)
        return NULL;
    if (PyUnicode_READY(ctx.fmtstr) == -1) {
        Py_DECREF(ctx.fmtstr);
        return NULL;
    }
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    /* fmtcnt counts the characters left to read, fmtpos is the read index */
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    if (PyTuple_Check(args)) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* Not a tuple: args is a single argument.  With these markers
           unicode_format_getnextarg() returns args itself exactly once. */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    /* Tuples and str are excluded from being used as a mapping */
    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
        ctx.dict = args;
    else
        ctx.dict = NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            /* Literal text: find the whole run up to the next '%' (or the
               end of the format string) and copy it in one call */
            Py_ssize_t nonfmtpos;

            nonfmtpos = ctx.fmtpos++;
            while (ctx.fmtcnt >= 0 &&
                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
                ctx.fmtpos++;
                ctx.fmtcnt--;
            }
            if (ctx.fmtcnt < 0) {
                /* The scan ran off the end of the format string: step back
                   one position; this is the last write, so stop
                   overallocating */
                ctx.fmtpos--;
                ctx.writer.overallocate = 0;
            }

            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                                nonfmtpos, ctx.fmtpos) < 0)
                goto onError;
        }
        else {
            /* Skip the '%' and handle the specifier that follows */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1)
                goto onError;
        }
    }

    /* Positional arguments left over after the whole format was processed */
    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* ctx.args is an owned reference only after a "%(name)" lookup */
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    Py_DECREF(ctx.fmtstr);
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    Py_DECREF(ctx.fmtstr);
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
14628
Jeremy Hylton938ace62002-07-17 16:30:39 +000014629static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014630unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14631
Tim Peters6d6c1a32001-08-02 04:15:00 +000014632static PyObject *
14633unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14634{
Benjamin Peterson29060642009-01-31 22:14:21 +000014635 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014636 static char *kwlist[] = {"object", "encoding", "errors", 0};
14637 char *encoding = NULL;
14638 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014639
Benjamin Peterson14339b62009-01-31 16:36:08 +000014640 if (type != &PyUnicode_Type)
14641 return unicode_subtype_new(type, args, kwds);
14642 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014643 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014644 return NULL;
14645 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014646 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014647 if (encoding == NULL && errors == NULL)
14648 return PyObject_Str(x);
14649 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014650 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014651}
14652
Guido van Rossume023fe02001-08-30 03:12:59 +000014653static PyObject *
14654unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14655{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014656 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014657 Py_ssize_t length, char_size;
14658 int share_wstr, share_utf8;
14659 unsigned int kind;
14660 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014661
Benjamin Peterson14339b62009-01-31 16:36:08 +000014662 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014663
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014664 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014665 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014666 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014667 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014668 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014669 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014670 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014671 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014672
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014673 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014674 if (self == NULL) {
14675 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014676 return NULL;
14677 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014678 kind = PyUnicode_KIND(unicode);
14679 length = PyUnicode_GET_LENGTH(unicode);
14680
14681 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014682#ifdef Py_DEBUG
14683 _PyUnicode_HASH(self) = -1;
14684#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014685 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014686#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014687 _PyUnicode_STATE(self).interned = 0;
14688 _PyUnicode_STATE(self).kind = kind;
14689 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014690 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014691 _PyUnicode_STATE(self).ready = 1;
14692 _PyUnicode_WSTR(self) = NULL;
14693 _PyUnicode_UTF8_LENGTH(self) = 0;
14694 _PyUnicode_UTF8(self) = NULL;
14695 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014696 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014697
14698 share_utf8 = 0;
14699 share_wstr = 0;
14700 if (kind == PyUnicode_1BYTE_KIND) {
14701 char_size = 1;
14702 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14703 share_utf8 = 1;
14704 }
14705 else if (kind == PyUnicode_2BYTE_KIND) {
14706 char_size = 2;
14707 if (sizeof(wchar_t) == 2)
14708 share_wstr = 1;
14709 }
14710 else {
14711 assert(kind == PyUnicode_4BYTE_KIND);
14712 char_size = 4;
14713 if (sizeof(wchar_t) == 4)
14714 share_wstr = 1;
14715 }
14716
14717 /* Ensure we won't overflow the length. */
14718 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14719 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014720 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014721 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014722 data = PyObject_MALLOC((length + 1) * char_size);
14723 if (data == NULL) {
14724 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014725 goto onError;
14726 }
14727
Victor Stinnerc3c74152011-10-02 20:39:55 +020014728 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014729 if (share_utf8) {
14730 _PyUnicode_UTF8_LENGTH(self) = length;
14731 _PyUnicode_UTF8(self) = data;
14732 }
14733 if (share_wstr) {
14734 _PyUnicode_WSTR_LENGTH(self) = length;
14735 _PyUnicode_WSTR(self) = (wchar_t *)data;
14736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014737
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014738 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014739 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014740 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014741#ifdef Py_DEBUG
14742 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14743#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014744 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014745 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014746
14747onError:
14748 Py_DECREF(unicode);
14749 Py_DECREF(self);
14750 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014751}
14752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014753PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014754"str(object='') -> str\n\
14755str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014756\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014757Create a new string object from the given object. If encoding or\n\
14758errors is specified, then the object must expose a data buffer\n\
14759that will be decoded using the given encoding and error handler.\n\
14760Otherwise, returns the result of object.__str__() (if defined)\n\
14761or repr(object).\n\
14762encoding defaults to sys.getdefaultencoding().\n\
14763errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014764
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014765static PyObject *unicode_iter(PyObject *seq);
14766
Guido van Rossumd57fd912000-03-10 22:53:23 +000014767PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014768 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014769 "str", /* tp_name */
14770 sizeof(PyUnicodeObject), /* tp_size */
14771 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014772 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014773 (destructor)unicode_dealloc, /* tp_dealloc */
14774 0, /* tp_print */
14775 0, /* tp_getattr */
14776 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014777 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014778 unicode_repr, /* tp_repr */
14779 &unicode_as_number, /* tp_as_number */
14780 &unicode_as_sequence, /* tp_as_sequence */
14781 &unicode_as_mapping, /* tp_as_mapping */
14782 (hashfunc) unicode_hash, /* tp_hash*/
14783 0, /* tp_call*/
14784 (reprfunc) unicode_str, /* tp_str */
14785 PyObject_GenericGetAttr, /* tp_getattro */
14786 0, /* tp_setattro */
14787 0, /* tp_as_buffer */
14788 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014789 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014790 unicode_doc, /* tp_doc */
14791 0, /* tp_traverse */
14792 0, /* tp_clear */
14793 PyUnicode_RichCompare, /* tp_richcompare */
14794 0, /* tp_weaklistoffset */
14795 unicode_iter, /* tp_iter */
14796 0, /* tp_iternext */
14797 unicode_methods, /* tp_methods */
14798 0, /* tp_members */
14799 0, /* tp_getset */
14800 &PyBaseObject_Type, /* tp_base */
14801 0, /* tp_dict */
14802 0, /* tp_descr_get */
14803 0, /* tp_descr_set */
14804 0, /* tp_dictoffset */
14805 0, /* tp_init */
14806 0, /* tp_alloc */
14807 unicode_new, /* tp_new */
14808 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014809};
14810
14811/* Initialize the Unicode implementation */
14812
Victor Stinner3a50e702011-10-18 21:21:00 +020014813int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014814{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014815 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014816 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014817 0x000A, /* LINE FEED */
14818 0x000D, /* CARRIAGE RETURN */
14819 0x001C, /* FILE SEPARATOR */
14820 0x001D, /* GROUP SEPARATOR */
14821 0x001E, /* RECORD SEPARATOR */
14822 0x0085, /* NEXT LINE */
14823 0x2028, /* LINE SEPARATOR */
14824 0x2029, /* PARAGRAPH SEPARATOR */
14825 };
14826
Fred Drakee4315f52000-05-09 19:53:39 +000014827 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014828 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014829 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014830 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014831 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014832
Guido van Rossumcacfc072002-05-24 19:01:59 +000014833 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014834 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014835
14836 /* initialize the linebreak bloom filter */
14837 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014838 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014839 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014840
Christian Heimes26532f72013-07-20 14:57:16 +020014841 if (PyType_Ready(&EncodingMapType) < 0)
14842 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014843
Benjamin Petersonc4311282012-10-30 23:21:10 -040014844 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14845 Py_FatalError("Can't initialize field name iterator type");
14846
14847 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14848 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014849
Victor Stinner3a50e702011-10-18 21:21:00 +020014850#ifdef HAVE_MBCS
14851 winver.dwOSVersionInfoSize = sizeof(winver);
14852 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14853 PyErr_SetFromWindowsErr(0);
14854 return -1;
14855 }
14856#endif
14857 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014858}
14859
14860/* Finalize the Unicode implementation */
14861
/* Does no work in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14867
Guido van Rossumd57fd912000-03-10 22:53:23 +000014868void
Thomas Wouters78890102000-07-22 19:25:51 +000014869_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014870{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014871 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014872
Serhiy Storchaka05997252013-01-26 12:14:02 +020014873 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014874
Serhiy Storchaka05997252013-01-26 12:14:02 +020014875 for (i = 0; i < 256; i++)
14876 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014877 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014878 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014879}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014880
Walter Dörwald16807132007-05-25 13:52:07 +000014881void
14882PyUnicode_InternInPlace(PyObject **p)
14883{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014884 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014885 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014886#ifdef Py_DEBUG
14887 assert(s != NULL);
14888 assert(_PyUnicode_CHECK(s));
14889#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014890 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014891 return;
14892#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014893 /* If it's a subclass, we don't really know what putting
14894 it in the interned dict might do. */
14895 if (!PyUnicode_CheckExact(s))
14896 return;
14897 if (PyUnicode_CHECK_INTERNED(s))
14898 return;
14899 if (interned == NULL) {
14900 interned = PyDict_New();
14901 if (interned == NULL) {
14902 PyErr_Clear(); /* Don't leave an exception */
14903 return;
14904 }
14905 }
14906 /* It might be that the GetItem call fails even
14907 though the key is present in the dictionary,
14908 namely when this happens during a stack overflow. */
14909 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014910 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014911 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014912
Victor Stinnerf0335102013-04-14 19:13:03 +020014913 if (t) {
14914 Py_INCREF(t);
14915 Py_DECREF(*p);
14916 *p = t;
14917 return;
14918 }
Walter Dörwald16807132007-05-25 13:52:07 +000014919
Benjamin Peterson14339b62009-01-31 16:36:08 +000014920 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014921 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014922 PyErr_Clear();
14923 PyThreadState_GET()->recursion_critical = 0;
14924 return;
14925 }
14926 PyThreadState_GET()->recursion_critical = 0;
14927 /* The two references in interned are not counted by refcnt.
14928 The deallocator will take care of this */
14929 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014930 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014931}
14932
14933void
14934PyUnicode_InternImmortal(PyObject **p)
14935{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014936 PyUnicode_InternInPlace(p);
14937 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014938 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014939 Py_INCREF(*p);
14940 }
Walter Dörwald16807132007-05-25 13:52:07 +000014941}
14942
14943PyObject *
14944PyUnicode_InternFromString(const char *cp)
14945{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014946 PyObject *s = PyUnicode_FromString(cp);
14947 if (s == NULL)
14948 return NULL;
14949 PyUnicode_InternInPlace(&s);
14950 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014951}
14952
Alexander Belopolsky40018472011-02-26 01:02:56 +000014953void
14954_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014955{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014956 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014957 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014958 Py_ssize_t i, n;
14959 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014960
Benjamin Peterson14339b62009-01-31 16:36:08 +000014961 if (interned == NULL || !PyDict_Check(interned))
14962 return;
14963 keys = PyDict_Keys(interned);
14964 if (keys == NULL || !PyList_Check(keys)) {
14965 PyErr_Clear();
14966 return;
14967 }
Walter Dörwald16807132007-05-25 13:52:07 +000014968
Benjamin Peterson14339b62009-01-31 16:36:08 +000014969 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14970 detector, interned unicode strings are not forcibly deallocated;
14971 rather, we give them their stolen references back, and then clear
14972 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014973
Benjamin Peterson14339b62009-01-31 16:36:08 +000014974 n = PyList_GET_SIZE(keys);
14975 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014976 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014977 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014978 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014979 if (PyUnicode_READY(s) == -1) {
14980 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014981 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014983 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014984 case SSTATE_NOT_INTERNED:
14985 /* XXX Shouldn't happen */
14986 break;
14987 case SSTATE_INTERNED_IMMORTAL:
14988 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014989 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014990 break;
14991 case SSTATE_INTERNED_MORTAL:
14992 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014993 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014994 break;
14995 default:
14996 Py_FatalError("Inconsistent interned string state.");
14997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014998 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014999 }
15000 fprintf(stderr, "total size of all interned strings: "
15001 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15002 "mortal/immortal\n", mortal_size, immortal_size);
15003 Py_DECREF(keys);
15004 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015005 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015006}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015007
15008
15009/********************* Unicode Iterator **************************/
15010
15011typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015012 PyObject_HEAD
15013 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015014 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015015} unicodeiterobject;
15016
15017static void
15018unicodeiter_dealloc(unicodeiterobject *it)
15019{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015020 _PyObject_GC_UNTRACK(it);
15021 Py_XDECREF(it->it_seq);
15022 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015023}
15024
15025static int
15026unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15027{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015028 Py_VISIT(it->it_seq);
15029 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015030}
15031
15032static PyObject *
15033unicodeiter_next(unicodeiterobject *it)
15034{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015035 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015036
Benjamin Peterson14339b62009-01-31 16:36:08 +000015037 assert(it != NULL);
15038 seq = it->it_seq;
15039 if (seq == NULL)
15040 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015041 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015043 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15044 int kind = PyUnicode_KIND(seq);
15045 void *data = PyUnicode_DATA(seq);
15046 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15047 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015048 if (item != NULL)
15049 ++it->it_index;
15050 return item;
15051 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015052
Benjamin Peterson14339b62009-01-31 16:36:08 +000015053 Py_DECREF(seq);
15054 it->it_seq = NULL;
15055 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015056}
15057
15058static PyObject *
15059unicodeiter_len(unicodeiterobject *it)
15060{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015061 Py_ssize_t len = 0;
15062 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015063 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015064 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015065}
15066
15067PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15068
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015069static PyObject *
15070unicodeiter_reduce(unicodeiterobject *it)
15071{
15072 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015073 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015074 it->it_seq, it->it_index);
15075 } else {
15076 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15077 if (u == NULL)
15078 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015079 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015080 }
15081}
15082
15083PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15084
15085static PyObject *
15086unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15087{
15088 Py_ssize_t index = PyLong_AsSsize_t(state);
15089 if (index == -1 && PyErr_Occurred())
15090 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015091 if (it->it_seq != NULL) {
15092 if (index < 0)
15093 index = 0;
15094 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15095 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15096 it->it_index = index;
15097 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015098 Py_RETURN_NONE;
15099}
15100
15101PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15102
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015103static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015104 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015105 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015106 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15107 reduce_doc},
15108 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15109 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015110 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015111};
15112
15113PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015114 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15115 "str_iterator", /* tp_name */
15116 sizeof(unicodeiterobject), /* tp_basicsize */
15117 0, /* tp_itemsize */
15118 /* methods */
15119 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15120 0, /* tp_print */
15121 0, /* tp_getattr */
15122 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015123 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015124 0, /* tp_repr */
15125 0, /* tp_as_number */
15126 0, /* tp_as_sequence */
15127 0, /* tp_as_mapping */
15128 0, /* tp_hash */
15129 0, /* tp_call */
15130 0, /* tp_str */
15131 PyObject_GenericGetAttr, /* tp_getattro */
15132 0, /* tp_setattro */
15133 0, /* tp_as_buffer */
15134 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15135 0, /* tp_doc */
15136 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15137 0, /* tp_clear */
15138 0, /* tp_richcompare */
15139 0, /* tp_weaklistoffset */
15140 PyObject_SelfIter, /* tp_iter */
15141 (iternextfunc)unicodeiter_next, /* tp_iternext */
15142 unicodeiter_methods, /* tp_methods */
15143 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015144};
15145
15146static PyObject *
15147unicode_iter(PyObject *seq)
15148{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015149 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015150
Benjamin Peterson14339b62009-01-31 16:36:08 +000015151 if (!PyUnicode_Check(seq)) {
15152 PyErr_BadInternalCall();
15153 return NULL;
15154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015155 if (PyUnicode_READY(seq) == -1)
15156 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015157 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15158 if (it == NULL)
15159 return NULL;
15160 it->it_index = 0;
15161 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015162 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015163 _PyObject_GC_TRACK(it);
15164 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015165}
15166
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015167
15168size_t
15169Py_UNICODE_strlen(const Py_UNICODE *u)
15170{
15171 int res = 0;
15172 while(*u++)
15173 res++;
15174 return res;
15175}
15176
15177Py_UNICODE*
15178Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15179{
15180 Py_UNICODE *u = s1;
15181 while ((*u++ = *s2++));
15182 return s1;
15183}
15184
15185Py_UNICODE*
15186Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15187{
15188 Py_UNICODE *u = s1;
15189 while ((*u++ = *s2++))
15190 if (n-- == 0)
15191 break;
15192 return s1;
15193}
15194
15195Py_UNICODE*
15196Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15197{
15198 Py_UNICODE *u1 = s1;
15199 u1 += Py_UNICODE_strlen(u1);
15200 Py_UNICODE_strcpy(u1, s2);
15201 return s1;
15202}
15203
15204int
15205Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15206{
15207 while (*s1 && *s2 && *s1 == *s2)
15208 s1++, s2++;
15209 if (*s1 && *s2)
15210 return (*s1 < *s2) ? -1 : +1;
15211 if (*s1)
15212 return 1;
15213 if (*s2)
15214 return -1;
15215 return 0;
15216}
15217
15218int
15219Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15220{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015221 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015222 for (; n != 0; n--) {
15223 u1 = *s1;
15224 u2 = *s2;
15225 if (u1 != u2)
15226 return (u1 < u2) ? -1 : +1;
15227 if (u1 == '\0')
15228 return 0;
15229 s1++;
15230 s2++;
15231 }
15232 return 0;
15233}
15234
15235Py_UNICODE*
15236Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15237{
15238 const Py_UNICODE *p;
15239 for (p = s; *p; p++)
15240 if (*p == c)
15241 return (Py_UNICODE*)p;
15242 return NULL;
15243}
15244
15245Py_UNICODE*
15246Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15247{
15248 const Py_UNICODE *p;
15249 p = s + Py_UNICODE_strlen(s);
15250 while (p != s) {
15251 p--;
15252 if (*p == c)
15253 return (Py_UNICODE*)p;
15254 }
15255 return NULL;
15256}
Victor Stinner331ea922010-08-10 16:37:20 +000015257
Victor Stinner71133ff2010-09-01 23:43:53 +000015258Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015259PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015260{
Victor Stinner577db2c2011-10-11 22:12:48 +020015261 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015262 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015264 if (!PyUnicode_Check(unicode)) {
15265 PyErr_BadArgument();
15266 return NULL;
15267 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015268 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015269 if (u == NULL)
15270 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015271 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015272 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015273 PyErr_NoMemory();
15274 return NULL;
15275 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015276 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015277 size *= sizeof(Py_UNICODE);
15278 copy = PyMem_Malloc(size);
15279 if (copy == NULL) {
15280 PyErr_NoMemory();
15281 return NULL;
15282 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015283 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015284 return copy;
15285}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015286
Georg Brandl66c221e2010-10-14 07:04:07 +000015287/* A _string module, to export formatter_parser and formatter_field_name_split
15288 to the string.Formatter class implemented in Python. */
15289
15290static PyMethodDef _string_methods[] = {
15291 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15292 METH_O, PyDoc_STR("split the argument as a field name")},
15293 {"formatter_parser", (PyCFunction) formatter_parser,
15294 METH_O, PyDoc_STR("parse the argument as a format string")},
15295 {NULL, NULL}
15296};
15297
15298static struct PyModuleDef _string_module = {
15299 PyModuleDef_HEAD_INIT,
15300 "_string",
15301 PyDoc_STR("string helper module"),
15302 0,
15303 _string_methods,
15304 NULL,
15305 NULL,
15306 NULL,
15307 NULL
15308};
15309
15310PyMODINIT_FUNC
15311PyInit__string(void)
15312{
15313 return PyModule_Create(&_string_module);
15314}
15315
15316
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015317#ifdef __cplusplus
15318}
15319#endif